Diffstat (limited to 'kernel/cpuset.c')
-rw-r--r--  kernel/cpuset.c  432
1 file changed, 225 insertions, 207 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 459d601947a8..d5ab79cf516d 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -54,7 +54,6 @@
 #include <asm/uaccess.h>
 #include <asm/atomic.h>
 #include <linux/mutex.h>
-#include <linux/kfifo.h>
 #include <linux/workqueue.h>
 #include <linux/cgroup.h>
 
@@ -227,10 +226,6 @@ static struct cpuset top_cpuset = {
  * The task_struct fields mems_allowed and mems_generation may only
  * be accessed in the context of that task, so require no locks.
  *
- * The cpuset_common_file_write handler for operations that modify
- * the cpuset hierarchy holds cgroup_mutex across the entire operation,
- * single threading all such cpuset modifications across the system.
- *
  * The cpuset_common_file_read() handlers only hold callback_mutex across
  * small pieces of code, such as when reading out possibly multi-word
  * cpumasks and nodemasks.
@@ -369,7 +364,7 @@ void cpuset_update_task_memory_state(void)
                my_cpusets_mem_gen = top_cpuset.mems_generation;
        } else {
                rcu_read_lock();
-               my_cpusets_mem_gen = task_cs(current)->mems_generation;
+               my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
                rcu_read_unlock();
        }
 
@@ -490,21 +485,51 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
 static void
 update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
 {
-       if (!dattr)
-               return;
        if (dattr->relax_domain_level < c->relax_domain_level)
                dattr->relax_domain_level = c->relax_domain_level;
        return;
 }
 
+static void
+update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
+{
+       LIST_HEAD(q);
+
+       list_add(&c->stack_list, &q);
+       while (!list_empty(&q)) {
+               struct cpuset *cp;
+               struct cgroup *cont;
+               struct cpuset *child;
+
+               cp = list_first_entry(&q, struct cpuset, stack_list);
+               list_del(q.next);
+
+               if (cpus_empty(cp->cpus_allowed))
+                       continue;
+
+               if (is_sched_load_balance(cp))
+                       update_domain_attr(dattr, cp);
+
+               list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
+                       child = cgroup_cs(cont);
+                       list_add_tail(&child->stack_list, &q);
+               }
+       }
+}
+
 /*
  * rebuild_sched_domains()
  *
- * If the flag 'sched_load_balance' of any cpuset with non-empty
- * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
- * which has that flag enabled, or if any cpuset with a non-empty
- * 'cpus' is removed, then call this routine to rebuild the
- * scheduler's dynamic sched domains.
+ * This routine will be called to rebuild the scheduler's dynamic
+ * sched domains:
+ * - if the flag 'sched_load_balance' of any cpuset with non-empty
+ *   'cpus' changes,
+ * - or if the 'cpus' allowed changes in any cpuset which has that
+ *   flag enabled,
+ * - or if the 'sched_relax_domain_level' of any cpuset which has
+ *   that flag enabled and with non-empty 'cpus' changes,
+ * - or if any cpuset with non-empty 'cpus' is removed,
+ * - or if a cpu gets offlined.
 *
 * This routine builds a partial partition of the systems CPUs
 * (the set of non-overlappping cpumask_t's in the array 'part'
@@ -531,7 +556,7 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
  * So the reverse nesting would risk an ABBA deadlock.
  *
  * The three key local variables below are:
- * q  - a kfifo queue of cpuset pointers, used to implement a
+ * q  - a linked-list queue of cpuset pointers, used to implement a
  *     top-down scan of all cpusets.  This scan loads a pointer
  *     to each cpuset marked is_sched_load_balance into the
  *     array 'csa'.  For our purposes, rebuilding the schedulers
@@ -564,9 +589,9 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
  * partition_sched_domains().
  */
 
-static void rebuild_sched_domains(void)
+void rebuild_sched_domains(void)
 {
-       struct kfifo *q;        /* queue of cpusets to be scanned */
+       LIST_HEAD(q);           /* queue of cpusets to be scanned */
        struct cpuset *cp;      /* scans q */
        struct cpuset **csa;    /* array of all cpuset ptrs */
        int csn;                /* how many cpuset ptrs in csa so far */
@@ -576,7 +601,6 @@ static void rebuild_sched_domains(void)
        int ndoms;              /* number of sched domains in result */
        int nslot;              /* next empty doms[] cpumask_t slot */
 
-       q = NULL;
        csa = NULL;
        doms = NULL;
        dattr = NULL;
@@ -590,30 +614,42 @@ static void rebuild_sched_domains(void)
                dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
                if (dattr) {
                        *dattr = SD_ATTR_INIT;
-                       update_domain_attr(dattr, &top_cpuset);
+                       update_domain_attr_tree(dattr, &top_cpuset);
                }
                *doms = top_cpuset.cpus_allowed;
                goto rebuild;
        }
 
-       q = kfifo_alloc(number_of_cpusets * sizeof(cp), GFP_KERNEL, NULL);
-       if (IS_ERR(q))
-               goto done;
        csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
        if (!csa)
                goto done;
        csn = 0;
 
-       cp = &top_cpuset;
-       __kfifo_put(q, (void *)&cp, sizeof(cp));
-       while (__kfifo_get(q, (void *)&cp, sizeof(cp))) {
+       list_add(&top_cpuset.stack_list, &q);
+       while (!list_empty(&q)) {
                struct cgroup *cont;
                struct cpuset *child;   /* scans child cpusets of cp */
-               if (is_sched_load_balance(cp))
+
+               cp = list_first_entry(&q, struct cpuset, stack_list);
+               list_del(q.next);
+
+               if (cpus_empty(cp->cpus_allowed))
+                       continue;
+
+               /*
+                * All child cpusets contain a subset of the parent's cpus, so
+                * just skip them, and then we call update_domain_attr_tree()
+                * to calc relax_domain_level of the corresponding sched
+                * domain.
+                */
+               if (is_sched_load_balance(cp)) {
                        csa[csn++] = cp;
+                       continue;
+               }
+
                list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
                        child = cgroup_cs(cont);
-                       __kfifo_put(q, (void *)&child, sizeof(cp));
+                       list_add_tail(&child->stack_list, &q);
                }
        }
 
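The hunks above replace the kfifo-based breadth-first scan with an intrusive list: each cpuset embeds a stack_list node, so queueing needs no separate allocation (and can no longer fail with -ENOMEM mid-walk), and a load-balanced cpuset's subtree is pruned outright, since every child's 'cpus' is a subset of its parent's. A rough userspace sketch of the same queue pattern, with hypothetical names (node, visit) standing in for cpuset and the csa[] bookkeeping, and hand-rolled linkage in place of list_head:

#include <stdio.h>

struct node {
	const char *name;
	struct node *child[3];	/* NULL-terminated child array */
	struct node *next;	/* embedded queue linkage, like stack_list */
};

static void visit(struct node *n)
{
	printf("scan %s\n", n->name);	/* stands in for csa[csn++] = cp */
}

int main(void)
{
	struct node c1 = { "kids",  { 0 }, 0 };
	struct node c2 = { "batch", { 0 }, 0 };
	struct node root = { "top", { &c1, &c2, 0 }, 0 };
	struct node *head = &root, *tail = &root;

	while (head) {			/* top-down scan, no allocations */
		struct node *cp = head;

		head = cp->next;
		if (!head)
			tail = NULL;
		visit(cp);
		for (int i = 0; cp->child[i]; i++) {	/* enqueue children */
			cp->child[i]->next = NULL;
			if (tail)
				tail->next = cp->child[i];
			else
				head = cp->child[i];
			tail = cp->child[i];
		}
	}
	return 0;
}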
@@ -679,7 +715,9 @@ restart:
                        if (apn == b->pn) {
                                cpus_or(*dp, *dp, b->cpus_allowed);
                                b->pn = -1;
-                               update_domain_attr(dattr, b);
+                               if (dattr)
+                                       update_domain_attr_tree(dattr
+                                                               + nslot, b);
                        }
                }
                nslot++;
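Note the indexing: dattr is now treated as an array of sched_domain_attr parallel to doms[] (the allocation that sizes it is outside this hunk), so dattr + nslot selects the attribute slot for the domain currently being assembled in doms[nslot], and update_domain_attr_tree() folds in the relax_domain_level of every load-balanced cpuset contributing to that domain.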
@@ -694,43 +732,11 @@ rebuild:
        put_online_cpus();
 
 done:
-       if (q && !IS_ERR(q))
-               kfifo_free(q);
        kfree(csa);
        /* Don't kfree(doms) -- partition_sched_domains() does that. */
        /* Don't kfree(dattr) -- partition_sched_domains() does that. */
 }
 
-static inline int started_after_time(struct task_struct *t1,
-                                    struct timespec *time,
-                                    struct task_struct *t2)
-{
-       int start_diff = timespec_compare(&t1->start_time, time);
-       if (start_diff > 0) {
-               return 1;
-       } else if (start_diff < 0) {
-               return 0;
-       } else {
-               /*
-                * Arbitrarily, if two processes started at the same
-                * time, we'll say that the lower pointer value
-                * started first. Note that t2 may have exited by now
-                * so this may not be a valid pointer any longer, but
-                * that's fine - it still serves to distinguish
-                * between two tasks started (effectively)
-                * simultaneously.
-                */
-               return t1 > t2;
-       }
-}
-
-static inline int started_after(void *p1, void *p2)
-{
-       struct task_struct *t1 = p1;
-       struct task_struct *t2 = p2;
-       return started_after_time(t1, &t2->start_time, t2);
-}
-
 /**
  * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's
  * @tsk: task to test
@@ -766,15 +772,49 @@ static void cpuset_change_cpumask(struct task_struct *tsk,
 }
 
 /**
+ * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
+ * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
+ *
+ * Called with cgroup_mutex held
+ *
+ * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
+ * calling callback functions for each.
+ *
+ * Return 0 if successful, -errno if not.
+ */
+static int update_tasks_cpumask(struct cpuset *cs)
+{
+       struct cgroup_scanner scan;
+       struct ptr_heap heap;
+       int retval;
+
+       /*
+        * cgroup_scan_tasks() will initialize heap->gt for us.
+        * heap_init() is still needed here for we should not change
+        * cs->cpus_allowed when heap_init() fails.
+        */
+       retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
+       if (retval)
+               return retval;
+
+       scan.cg = cs->css.cgroup;
+       scan.test_task = cpuset_test_cpumask;
+       scan.process_task = cpuset_change_cpumask;
+       scan.heap = &heap;
+       retval = cgroup_scan_tasks(&scan);
+
+       heap_free(&heap);
+       return retval;
+}
+
+/**
  * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
  * @cs: the cpuset to consider
  * @buf: buffer of cpu numbers written to this cpuset
  */
-static int update_cpumask(struct cpuset *cs, char *buf)
+static int update_cpumask(struct cpuset *cs, const char *buf)
 {
        struct cpuset trialcs;
-       struct cgroup_scanner scan;
-       struct ptr_heap heap;
        int retval;
        int is_load_balanced;
 
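With update_tasks_cpumask() factored out, the heap comparator also moves out of cpuset.c: heap_init() is passed a NULL gt and cgroup_scan_tasks() installs its own task-ordering function, the role the deleted started_after()/started_after_time() pair used to play, so each task is still visited at most once even as tasks fork during the scan.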
@@ -790,7 +830,6 @@ static int update_cpumask(struct cpuset *cs, char *buf)
         * that parsing.  The validate_change() call ensures that cpusets
         * with tasks have cpus.
         */
-       buf = strstrip(buf);
        if (!*buf) {
                cpus_clear(trialcs.cpus_allowed);
        } else {
@@ -809,10 +848,6 @@ static int update_cpumask(struct cpuset *cs, char *buf)
        if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
                return 0;
 
-       retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after);
-       if (retval)
-               return retval;
-
        is_load_balanced = is_sched_load_balance(&trialcs);
 
        mutex_lock(&callback_mutex);
@@ -823,12 +858,9 @@ static int update_cpumask(struct cpuset *cs, char *buf)
         * Scan tasks in the cpuset, and update the cpumasks of any
         * that need an update.
         */
-       scan.cg = cs->css.cgroup;
-       scan.test_task = cpuset_test_cpumask;
-       scan.process_task = cpuset_change_cpumask;
-       scan.heap = &heap;
-       cgroup_scan_tasks(&scan);
-       heap_free(&heap);
+       retval = update_tasks_cpumask(cs);
+       if (retval < 0)
+               return retval;
 
        if (is_load_balanced)
                rebuild_sched_domains();
@@ -884,74 +916,25 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
        mutex_unlock(&callback_mutex);
 }
 
-/*
- * Handle user request to change the 'mems' memory placement
- * of a cpuset.  Needs to validate the request, update the
- * cpusets mems_allowed and mems_generation, and for each
- * task in the cpuset, rebind any vma mempolicies and if
- * the cpuset is marked 'memory_migrate', migrate the tasks
- * pages to the new memory.
- *
- * Call with cgroup_mutex held.  May take callback_mutex during call.
- * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
- * lock each such tasks mm->mmap_sem, scan its vma's and rebind
- * their mempolicies to the cpusets new mems_allowed.
- */
-
 static void *cpuset_being_rebound;
 
-static int update_nodemask(struct cpuset *cs, char *buf)
+/**
+ * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
+ * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
+ * @oldmem: old mems_allowed of cpuset cs
+ *
+ * Called with cgroup_mutex held
+ * Return 0 if successful, -errno if not.
+ */
+static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
 {
-       struct cpuset trialcs;
-       nodemask_t oldmem;
        struct task_struct *p;
        struct mm_struct **mmarray;
        int i, n, ntasks;
        int migrate;
        int fudge;
-       int retval;
        struct cgroup_iter it;
-
-       /*
-        * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
-        * it's read-only
-        */
-       if (cs == &top_cpuset)
-               return -EACCES;
-
-       trialcs = *cs;
-
-       /*
-        * An empty mems_allowed is ok iff there are no tasks in the cpuset.
-        * Since nodelist_parse() fails on an empty mask, we special case
-        * that parsing.  The validate_change() call ensures that cpusets
-        * with tasks have memory.
-        */
-       buf = strstrip(buf);
-       if (!*buf) {
-               nodes_clear(trialcs.mems_allowed);
-       } else {
-               retval = nodelist_parse(buf, trialcs.mems_allowed);
-               if (retval < 0)
-                       goto done;
-
-               if (!nodes_subset(trialcs.mems_allowed,
-                               node_states[N_HIGH_MEMORY]))
-                       return -EINVAL;
-       }
-       oldmem = cs->mems_allowed;
-       if (nodes_equal(oldmem, trialcs.mems_allowed)) {
-               retval = 0;             /* Too easy - nothing to do */
-               goto done;
-       }
-       retval = validate_change(cs, &trialcs);
-       if (retval < 0)
-               goto done;
-
-       mutex_lock(&callback_mutex);
-       cs->mems_allowed = trialcs.mems_allowed;
-       cs->mems_generation = cpuset_mems_generation++;
-       mutex_unlock(&callback_mutex);
+       int retval;
 
        cpuset_being_rebound = cs;              /* causes mpol_dup() rebind */
 
@@ -1018,7 +1001,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
 
                mpol_rebind_mm(mm, &cs->mems_allowed);
                if (migrate)
-                       cpuset_migrate_mm(mm, &oldmem, &cs->mems_allowed);
+                       cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
                mmput(mm);
        }
 
@@ -1030,6 +1013,70 @@ done:
        return retval;
 }
 
+/*
+ * Handle user request to change the 'mems' memory placement
+ * of a cpuset.  Needs to validate the request, update the
+ * cpusets mems_allowed and mems_generation, and for each
+ * task in the cpuset, rebind any vma mempolicies and if
+ * the cpuset is marked 'memory_migrate', migrate the tasks
+ * pages to the new memory.
+ *
+ * Call with cgroup_mutex held.  May take callback_mutex during call.
+ * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
+ * lock each such tasks mm->mmap_sem, scan its vma's and rebind
+ * their mempolicies to the cpusets new mems_allowed.
+ */
+static int update_nodemask(struct cpuset *cs, const char *buf)
+{
+       struct cpuset trialcs;
+       nodemask_t oldmem;
+       int retval;
+
+       /*
+        * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
+        * it's read-only
+        */
+       if (cs == &top_cpuset)
+               return -EACCES;
+
+       trialcs = *cs;
+
+       /*
+        * An empty mems_allowed is ok iff there are no tasks in the cpuset.
+        * Since nodelist_parse() fails on an empty mask, we special case
+        * that parsing.  The validate_change() call ensures that cpusets
+        * with tasks have memory.
+        */
+       if (!*buf) {
+               nodes_clear(trialcs.mems_allowed);
+       } else {
+               retval = nodelist_parse(buf, trialcs.mems_allowed);
+               if (retval < 0)
+                       goto done;
+
+               if (!nodes_subset(trialcs.mems_allowed,
+                               node_states[N_HIGH_MEMORY]))
+                       return -EINVAL;
+       }
+       oldmem = cs->mems_allowed;
+       if (nodes_equal(oldmem, trialcs.mems_allowed)) {
+               retval = 0;             /* Too easy - nothing to do */
+               goto done;
+       }
+       retval = validate_change(cs, &trialcs);
+       if (retval < 0)
+               goto done;
+
+       mutex_lock(&callback_mutex);
+       cs->mems_allowed = trialcs.mems_allowed;
+       cs->mems_generation = cpuset_mems_generation++;
+       mutex_unlock(&callback_mutex);
+
+       retval = update_tasks_nodemask(cs, &oldmem);
+done:
+       return retval;
+}
+
 int current_cpuset_is_being_rebound(void)
 {
        return task_cs(current) == cpuset_being_rebound;
@@ -1042,7 +1089,8 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
 
        if (val != cs->relax_domain_level) {
                cs->relax_domain_level = val;
-               rebuild_sched_domains();
+               if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs))
+                       rebuild_sched_domains();
        }
 
        return 0;
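The new guard skips a pointless rebuild_sched_domains() call: a cpuset with no CPUs, or with sched_load_balance off, contributes no sched domain of its own, so changing its relax_domain_level cannot affect the generated partition.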
@@ -1254,72 +1302,14 @@ typedef enum {
        FILE_SPREAD_SLAB,
 } cpuset_filetype_t;
 
-static ssize_t cpuset_common_file_write(struct cgroup *cont,
-                                       struct cftype *cft,
-                                       struct file *file,
-                                       const char __user *userbuf,
-                                       size_t nbytes, loff_t *unused_ppos)
-{
-       struct cpuset *cs = cgroup_cs(cont);
-       cpuset_filetype_t type = cft->private;
-       char *buffer;
-       int retval = 0;
-
-       /* Crude upper limit on largest legitimate cpulist user might write. */
-       if (nbytes > 100U + 6 * max(NR_CPUS, MAX_NUMNODES))
-               return -E2BIG;
-
-       /* +1 for nul-terminator */
-       buffer = kmalloc(nbytes + 1, GFP_KERNEL);
-       if (!buffer)
-               return -ENOMEM;
-
-       if (copy_from_user(buffer, userbuf, nbytes)) {
-               retval = -EFAULT;
-               goto out1;
-       }
-       buffer[nbytes] = 0;     /* nul-terminate */
-
-       cgroup_lock();
-
-       if (cgroup_is_removed(cont)) {
-               retval = -ENODEV;
-               goto out2;
-       }
-
-       switch (type) {
-       case FILE_CPULIST:
-               retval = update_cpumask(cs, buffer);
-               break;
-       case FILE_MEMLIST:
-               retval = update_nodemask(cs, buffer);
-               break;
-       default:
-               retval = -EINVAL;
-               goto out2;
-       }
-
-       if (retval == 0)
-               retval = nbytes;
-out2:
-       cgroup_unlock();
-out1:
-       kfree(buffer);
-       return retval;
-}
-
 static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
 {
        int retval = 0;
        struct cpuset *cs = cgroup_cs(cgrp);
        cpuset_filetype_t type = cft->private;
 
-       cgroup_lock();
-
-       if (cgroup_is_removed(cgrp)) {
-               cgroup_unlock();
+       if (!cgroup_lock_live_group(cgrp))
                return -ENODEV;
-       }
 
        switch (type) {
        case FILE_CPU_EXCLUSIVE:
@@ -1365,12 +1355,9 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
        struct cpuset *cs = cgroup_cs(cgrp);
        cpuset_filetype_t type = cft->private;
 
-       cgroup_lock();
-
-       if (cgroup_is_removed(cgrp)) {
-               cgroup_unlock();
+       if (!cgroup_lock_live_group(cgrp))
                return -ENODEV;
-       }
+
        switch (type) {
        case FILE_SCHED_RELAX_DOMAIN_LEVEL:
                retval = update_relax_domain_level(cs, val);
@@ -1384,6 +1371,32 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
 }
 
 /*
+ * Common handling for a write to a "cpus" or "mems" file.
+ */
+static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
+                               const char *buf)
+{
+       int retval = 0;
+
+       if (!cgroup_lock_live_group(cgrp))
+               return -ENODEV;
+
+       switch (cft->private) {
+       case FILE_CPULIST:
+               retval = update_cpumask(cgroup_cs(cgrp), buf);
+               break;
+       case FILE_MEMLIST:
+               retval = update_nodemask(cgroup_cs(cgrp), buf);
+               break;
+       default:
+               retval = -EINVAL;
+               break;
+       }
+       cgroup_unlock();
+       return retval;
+}
+
+/*
  * These ascii lists should be read in a single call, by using a user
  * buffer large enough to hold the entire map.  If read in smaller
  * chunks, there is no guarantee of atomicity.  Since the display format
@@ -1502,14 +1515,16 @@ static struct cftype files[] = {
        {
                .name = "cpus",
                .read = cpuset_common_file_read,
-               .write = cpuset_common_file_write,
+               .write_string = cpuset_write_resmask,
+               .max_write_len = (100U + 6 * NR_CPUS),
                .private = FILE_CPULIST,
        },
 
        {
                .name = "mems",
                .read = cpuset_common_file_read,
-               .write = cpuset_common_file_write,
+               .write_string = cpuset_write_resmask,
+               .max_write_len = (100U + 6 * MAX_NUMNODES),
                .private = FILE_MEMLIST,
        },
 
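With .write_string set, the buffer handling that cpuset_common_file_write() did by hand -- bounding the write (now per file via .max_write_len, preserving the old 100U + 6 * N limit and -E2BIG behavior), copying from userspace, nul-terminating, and stripping whitespace -- is presumably done once in the cgroup core, which is also why the strstrip() calls disappear from update_cpumask() and update_nodemask(): the handlers now receive a clean const string. A userspace analog of the handler's assumed new contract, with hypothetical names (strip, write_cpus):

#include <ctype.h>
#include <stdio.h>
#include <string.h>

/* What the cgroup core is assumed to do before calling the handler. */
static char *strip(char *s)
{
	size_t len = strlen(s);

	while (len && isspace((unsigned char)s[len - 1]))
		s[--len] = '\0';		/* trailing whitespace */
	while (*s && isspace((unsigned char)*s))
		s++;				/* leading whitespace */
	return s;
}

/* Stand-in for update_cpumask(): gets a stripped, nul-terminated string. */
static int write_cpus(const char *buf)
{
	printf("cpus <- \"%s\"\n", buf);
	return 0;
}

int main(void)
{
	char raw[] = " 0-3,8\n";		/* as echoed by a user */

	return write_cpus(strip(raw));
}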
@@ -1790,7 +1805,7 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
        scan.scan.heap = NULL;
        scan.to = to->css.cgroup;
 
-       if (cgroup_scan_tasks((struct cgroup_scanner *)&scan))
+       if (cgroup_scan_tasks(&scan.scan))
                printk(KERN_ERR "move_member_tasks_to_cpuset: "
                                "cgroup_scan_tasks failed\n");
 }
@@ -1846,29 +1861,29 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
  */
 static void scan_for_empty_cpusets(const struct cpuset *root)
 {
+       LIST_HEAD(queue);
        struct cpuset *cp;      /* scans cpusets being updated */
        struct cpuset *child;   /* scans child cpusets of cp */
-       struct list_head queue;
        struct cgroup *cont;
-
-       INIT_LIST_HEAD(&queue);
+       nodemask_t oldmems;
 
        list_add_tail((struct list_head *)&root->stack_list, &queue);
 
        while (!list_empty(&queue)) {
-               cp = container_of(queue.next, struct cpuset, stack_list);
+               cp = list_first_entry(&queue, struct cpuset, stack_list);
                list_del(queue.next);
                list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
                        child = cgroup_cs(cont);
                        list_add_tail(&child->stack_list, &queue);
                }
-               cont = cp->css.cgroup;
 
                /* Continue past cpusets with all cpus, mems online */
                if (cpus_subset(cp->cpus_allowed, cpu_online_map) &&
                    nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
                        continue;
 
+               oldmems = cp->mems_allowed;
+
                /* Remove offline cpus and mems from this cpuset. */
                mutex_lock(&callback_mutex);
                cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
@@ -1880,6 +1895,10 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
                if (cpus_empty(cp->cpus_allowed) ||
                    nodes_empty(cp->mems_allowed))
                        remove_tasks_in_empty_cpuset(cp);
+               else {
+                       update_tasks_cpumask(cp);
+                       update_tasks_nodemask(cp, &oldmems);
+               }
        }
 }
 
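This is the behavioral payoff in the hotplug path: previously a cpuset that lost some, but not all, of its cpus or mems on an offline event left its tasks running with stale per-task masks and mempolicies. Now the factored-out update_tasks_cpumask()/update_tasks_nodemask() push the trimmed masks down to every task, with oldmems captured earlier in the loop so the nodemask rebinding (and any 'memory_migrate' page migration) knows which nodes it is rebinding from.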
@@ -1972,7 +1991,6 @@ void __init cpuset_init_smp(void)
 }
 
 /**
-
  * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
  * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
  * @pmask: pointer to cpumask_t variable to receive cpus_allowed set.
