Diffstat (limited to 'kernel/cpuset.c')
-rw-r--r--   kernel/cpuset.c | 500
1 file changed, 301 insertions, 199 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 64b3f791bbe5..ea1966db34f2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
| @@ -59,6 +59,7 @@ | |||
| 59 | #include <linux/mutex.h> | 59 | #include <linux/mutex.h> |
| 60 | #include <linux/workqueue.h> | 60 | #include <linux/workqueue.h> |
| 61 | #include <linux/cgroup.h> | 61 | #include <linux/cgroup.h> |
| 62 | #include <linux/wait.h> | ||
| 62 | 63 | ||
| 63 | /* | 64 | /* |
| 64 | * Tracks how many cpusets are currently defined in system. | 65 | * Tracks how many cpusets are currently defined in system. |
| @@ -87,6 +88,18 @@ struct cpuset { | |||
| 87 | cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ | 88 | cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ |
| 88 | nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ | 89 | nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ |
| 89 | 90 | ||
| 91 | /* | ||
| 92 | * These are the old Memory Nodes that tasks took on. | ||
| 93 | * | ||
| 94 | * - top_cpuset.old_mems_allowed is initialized to mems_allowed. | ||
| 95 | * - A new cpuset's old_mems_allowed is initialized when some | ||
| 96 | * task is moved into it. | ||
| 97 | * - old_mems_allowed is used in cpuset_migrate_mm() when we change | ||
| 98 | * cpuset.mems_allowed and have tasks' nodemask updated, and | ||
| 99 | * then old_mems_allowed is updated to mems_allowed. | ||
| 100 | */ | ||
| 101 | nodemask_t old_mems_allowed; | ||
| 102 | |||
| 90 | struct fmeter fmeter; /* memory_pressure filter */ | 103 | struct fmeter fmeter; /* memory_pressure filter */ |
| 91 | 104 | ||
| 92 | /* | 105 | /* |
| @@ -100,14 +113,12 @@ struct cpuset { | |||
| 100 | 113 | ||
| 101 | /* for custom sched domain */ | 114 | /* for custom sched domain */ |
| 102 | int relax_domain_level; | 115 | int relax_domain_level; |
| 103 | |||
| 104 | struct work_struct hotplug_work; | ||
| 105 | }; | 116 | }; |
| 106 | 117 | ||
| 107 | /* Retrieve the cpuset for a cgroup */ | 118 | /* Retrieve the cpuset for a cgroup */ |
| 108 | static inline struct cpuset *cgroup_cs(struct cgroup *cont) | 119 | static inline struct cpuset *cgroup_cs(struct cgroup *cgrp) |
| 109 | { | 120 | { |
| 110 | return container_of(cgroup_subsys_state(cont, cpuset_subsys_id), | 121 | return container_of(cgroup_subsys_state(cgrp, cpuset_subsys_id), |
| 111 | struct cpuset, css); | 122 | struct cpuset, css); |
| 112 | } | 123 | } |
| 113 | 124 | ||
| @@ -267,14 +278,11 @@ static DEFINE_MUTEX(callback_mutex); | |||
| 267 | /* | 278 | /* |
| 268 | * CPU / memory hotplug is handled asynchronously. | 279 | * CPU / memory hotplug is handled asynchronously. |
| 269 | */ | 280 | */ |
| 270 | static struct workqueue_struct *cpuset_propagate_hotplug_wq; | ||
| 271 | |||
| 272 | static void cpuset_hotplug_workfn(struct work_struct *work); | 281 | static void cpuset_hotplug_workfn(struct work_struct *work); |
| 273 | static void cpuset_propagate_hotplug_workfn(struct work_struct *work); | ||
| 274 | static void schedule_cpuset_propagate_hotplug(struct cpuset *cs); | ||
| 275 | |||
| 276 | static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); | 282 | static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); |
| 277 | 283 | ||
| 284 | static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); | ||
| 285 | |||
| 278 | /* | 286 | /* |
| 279 | * This is ugly, but preserves the userspace API for existing cpuset | 287 | * This is ugly, but preserves the userspace API for existing cpuset |
| 280 | * users. If someone tries to mount the "cpuset" filesystem, we | 288 | * users. If someone tries to mount the "cpuset" filesystem, we |
| @@ -304,53 +312,38 @@ static struct file_system_type cpuset_fs_type = { | |||
| 304 | /* | 312 | /* |
| 305 | * Return in pmask the portion of a cpusets's cpus_allowed that | 313 | * Return in pmask the portion of a cpusets's cpus_allowed that |
| 306 | * are online. If none are online, walk up the cpuset hierarchy | 314 | * are online. If none are online, walk up the cpuset hierarchy |
| 307 | * until we find one that does have some online cpus. If we get | 315 | * until we find one that does have some online cpus. The top |
| 308 | * all the way to the top and still haven't found any online cpus, | 316 | * cpuset always has some cpus online. |
| 309 | * return cpu_online_mask. Or if passed a NULL cs from an exit'ing | ||
| 310 | * task, return cpu_online_mask. | ||
| 311 | * | 317 | * |
| 312 | * One way or another, we guarantee to return some non-empty subset | 318 | * One way or another, we guarantee to return some non-empty subset |
| 313 | * of cpu_online_mask. | 319 | * of cpu_online_mask. |
| 314 | * | 320 | * |
| 315 | * Call with callback_mutex held. | 321 | * Call with callback_mutex held. |
| 316 | */ | 322 | */ |
| 317 | |||
| 318 | static void guarantee_online_cpus(const struct cpuset *cs, | 323 | static void guarantee_online_cpus(const struct cpuset *cs, |
| 319 | struct cpumask *pmask) | 324 | struct cpumask *pmask) |
| 320 | { | 325 | { |
| 321 | while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) | 326 | while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) |
| 322 | cs = parent_cs(cs); | 327 | cs = parent_cs(cs); |
| 323 | if (cs) | 328 | cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); |
| 324 | cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); | ||
| 325 | else | ||
| 326 | cpumask_copy(pmask, cpu_online_mask); | ||
| 327 | BUG_ON(!cpumask_intersects(pmask, cpu_online_mask)); | ||
| 328 | } | 329 | } |
| 329 | 330 | ||
| 330 | /* | 331 | /* |
| 331 | * Return in *pmask the portion of a cpusets's mems_allowed that | 332 | * Return in *pmask the portion of a cpusets's mems_allowed that |
| 332 | * are online, with memory. If none are online with memory, walk | 333 | * are online, with memory. If none are online with memory, walk |
| 333 | * up the cpuset hierarchy until we find one that does have some | 334 | * up the cpuset hierarchy until we find one that does have some |
| 334 | * online mems. If we get all the way to the top and still haven't | 335 | * online mems. The top cpuset always has some mems online. |
| 335 | * found any online mems, return node_states[N_MEMORY]. | ||
| 336 | * | 336 | * |
| 337 | * One way or another, we guarantee to return some non-empty subset | 337 | * One way or another, we guarantee to return some non-empty subset |
| 338 | * of node_states[N_MEMORY]. | 338 | * of node_states[N_MEMORY]. |
| 339 | * | 339 | * |
| 340 | * Call with callback_mutex held. | 340 | * Call with callback_mutex held. |
| 341 | */ | 341 | */ |
| 342 | |||
| 343 | static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | 342 | static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) |
| 344 | { | 343 | { |
| 345 | while (cs && !nodes_intersects(cs->mems_allowed, | 344 | while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) |
| 346 | node_states[N_MEMORY])) | ||
| 347 | cs = parent_cs(cs); | 345 | cs = parent_cs(cs); |
| 348 | if (cs) | 346 | nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]); |
| 349 | nodes_and(*pmask, cs->mems_allowed, | ||
| 350 | node_states[N_MEMORY]); | ||
| 351 | else | ||
| 352 | *pmask = node_states[N_MEMORY]; | ||
| 353 | BUG_ON(!nodes_intersects(*pmask, node_states[N_MEMORY])); | ||
| 354 | } | 347 | } |
| 355 | 348 | ||
| 356 | /* | 349 | /* |
| @@ -440,7 +433,7 @@ static void free_trial_cpuset(struct cpuset *trial) | |||
| 440 | 433 | ||
| 441 | static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | 434 | static int validate_change(const struct cpuset *cur, const struct cpuset *trial) |
| 442 | { | 435 | { |
| 443 | struct cgroup *cont; | 436 | struct cgroup *cgrp; |
| 444 | struct cpuset *c, *par; | 437 | struct cpuset *c, *par; |
| 445 | int ret; | 438 | int ret; |
| 446 | 439 | ||
| @@ -448,7 +441,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
| 448 | 441 | ||
| 449 | /* Each of our child cpusets must be a subset of us */ | 442 | /* Each of our child cpusets must be a subset of us */ |
| 450 | ret = -EBUSY; | 443 | ret = -EBUSY; |
| 451 | cpuset_for_each_child(c, cont, cur) | 444 | cpuset_for_each_child(c, cgrp, cur) |
| 452 | if (!is_cpuset_subset(c, trial)) | 445 | if (!is_cpuset_subset(c, trial)) |
| 453 | goto out; | 446 | goto out; |
| 454 | 447 | ||
| @@ -469,7 +462,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
| 469 | * overlap | 462 | * overlap |
| 470 | */ | 463 | */ |
| 471 | ret = -EINVAL; | 464 | ret = -EINVAL; |
| 472 | cpuset_for_each_child(c, cont, par) { | 465 | cpuset_for_each_child(c, cgrp, par) { |
| 473 | if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && | 466 | if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && |
| 474 | c != cur && | 467 | c != cur && |
| 475 | cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) | 468 | cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) |
| @@ -482,13 +475,17 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
| 482 | 475 | ||
| 483 | /* | 476 | /* |
| 484 | * Cpusets with tasks - existing or newly being attached - can't | 477 | * Cpusets with tasks - existing or newly being attached - can't |
| 485 | * have empty cpus_allowed or mems_allowed. | 478 | * be changed to have empty cpus_allowed or mems_allowed. |
| 486 | */ | 479 | */ |
| 487 | ret = -ENOSPC; | 480 | ret = -ENOSPC; |
| 488 | if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) && | 481 | if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress)) { |
| 489 | (cpumask_empty(trial->cpus_allowed) || | 482 | if (!cpumask_empty(cur->cpus_allowed) && |
| 490 | nodes_empty(trial->mems_allowed))) | 483 | cpumask_empty(trial->cpus_allowed)) |
| 491 | goto out; | 484 | goto out; |
| 485 | if (!nodes_empty(cur->mems_allowed) && | ||
| 486 | nodes_empty(trial->mems_allowed)) | ||
| 487 | goto out; | ||
| 488 | } | ||
| 492 | 489 | ||
| 493 | ret = 0; | 490 | ret = 0; |
| 494 | out: | 491 | out: |
| @@ -540,7 +537,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, | |||
| 540 | * This function builds a partial partition of the systems CPUs | 537 | * This function builds a partial partition of the systems CPUs |
| 541 | * A 'partial partition' is a set of non-overlapping subsets whose | 538 | * A 'partial partition' is a set of non-overlapping subsets whose |
| 542 | * union is a subset of that set. | 539 | * union is a subset of that set. |
| 543 | * The output of this function needs to be passed to kernel/sched.c | 540 | * The output of this function needs to be passed to kernel/sched/core.c |
| 544 | * partition_sched_domains() routine, which will rebuild the scheduler's | 541 | * partition_sched_domains() routine, which will rebuild the scheduler's |
| 545 | * load balancing domains (sched domains) as specified by that partial | 542 | * load balancing domains (sched domains) as specified by that partial |
| 546 | * partition. | 543 | * partition. |
| @@ -569,7 +566,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, | |||
| 569 | * is a subset of one of these domains, while there are as | 566 | * is a subset of one of these domains, while there are as |
| 570 | * many such domains as possible, each as small as possible. | 567 | * many such domains as possible, each as small as possible. |
| 571 | * doms - Conversion of 'csa' to an array of cpumasks, for passing to | 568 | * doms - Conversion of 'csa' to an array of cpumasks, for passing to |
| 572 | * the kernel/sched.c routine partition_sched_domains() in a | 569 | * the kernel/sched/core.c routine partition_sched_domains() in a |
| 573 | * convenient format, that can be easily compared to the prior | 570 | * convenient format, that can be easily compared to the prior |
| 574 | * value to determine what partition elements (sched domains) | 571 | * value to determine what partition elements (sched domains) |
| 575 | * were changed (added or removed.) | 572 | * were changed (added or removed.) |
| @@ -798,21 +795,43 @@ void rebuild_sched_domains(void) | |||
| 798 | mutex_unlock(&cpuset_mutex); | 795 | mutex_unlock(&cpuset_mutex); |
| 799 | } | 796 | } |
| 800 | 797 | ||
| 801 | /** | 798 | /* |
| 802 | * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's | 799 | * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus |
| 803 | * @tsk: task to test | 800 | * @cs: the cpuset in interest |
| 804 | * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner | ||
| 805 | * | 801 | * |
| 806 | * Call with cpuset_mutex held. May take callback_mutex during call. | 802 | * A cpuset's effective cpumask is the cpumask of the nearest ancestor |
| 807 | * Called for each task in a cgroup by cgroup_scan_tasks(). | 803 | * with non-empty cpus. We use effective cpumask whenever: |
| 808 | * Return nonzero if this tasks's cpus_allowed mask should be changed (in other | 804 | * - we update tasks' cpus_allowed. (they take on the ancestor's cpumask |
| 809 | * words, if its mask is not equal to its cpuset's mask). | 805 | * if the cpuset they reside in has no cpus) |
| 806 | * - we want to retrieve task_cs(tsk)'s cpus_allowed. | ||
| 807 | * | ||
| 808 | * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an | ||
| 809 | * exception. See comments there. | ||
| 810 | */ | 810 | */ |
| 811 | static int cpuset_test_cpumask(struct task_struct *tsk, | 811 | static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs) |
| 812 | struct cgroup_scanner *scan) | ||
| 813 | { | 812 | { |
| 814 | return !cpumask_equal(&tsk->cpus_allowed, | 813 | while (cpumask_empty(cs->cpus_allowed)) |
| 815 | (cgroup_cs(scan->cg))->cpus_allowed); | 814 | cs = parent_cs(cs); |
| 815 | return cs; | ||
| 816 | } | ||
| 817 | |||
| 818 | /* | ||
| 819 | * effective_nodemask_cpuset - return nearest ancestor with non-empty mems | ||
| 820 | * @cs: the cpuset in interest | ||
| 821 | * | ||
| 822 | * A cpuset's effective nodemask is the nodemask of the nearest ancestor | ||
| 823 | * with non-empty mems. We use effective nodemask whenever: | ||
| 824 | * - we update tasks' mems_allowed. (they take on the ancestor's nodemask | ||
| 825 | * if the cpuset they reside in has no mems) | ||
| 826 | * - we want to retrieve task_cs(tsk)'s mems_allowed. | ||
| 827 | * | ||
| 828 | * Called with cpuset_mutex held. | ||
| 829 | */ | ||
| 830 | static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs) | ||
| 831 | { | ||
| 832 | while (nodes_empty(cs->mems_allowed)) | ||
| 833 | cs = parent_cs(cs); | ||
| 834 | return cs; | ||
| 816 | } | 835 | } |
| 817 | 836 | ||
| 818 | /** | 837 | /** |
| @@ -829,7 +848,10 @@ static int cpuset_test_cpumask(struct task_struct *tsk, | |||
| 829 | static void cpuset_change_cpumask(struct task_struct *tsk, | 848 | static void cpuset_change_cpumask(struct task_struct *tsk, |
| 830 | struct cgroup_scanner *scan) | 849 | struct cgroup_scanner *scan) |
| 831 | { | 850 | { |
| 832 | set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed)); | 851 | struct cpuset *cpus_cs; |
| 852 | |||
| 853 | cpus_cs = effective_cpumask_cpuset(cgroup_cs(scan->cg)); | ||
| 854 | set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed); | ||
| 833 | } | 855 | } |
| 834 | 856 | ||
| 835 | /** | 857 | /** |
| @@ -850,12 +872,51 @@ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) | |||
| 850 | struct cgroup_scanner scan; | 872 | struct cgroup_scanner scan; |
| 851 | 873 | ||
| 852 | scan.cg = cs->css.cgroup; | 874 | scan.cg = cs->css.cgroup; |
| 853 | scan.test_task = cpuset_test_cpumask; | 875 | scan.test_task = NULL; |
| 854 | scan.process_task = cpuset_change_cpumask; | 876 | scan.process_task = cpuset_change_cpumask; |
| 855 | scan.heap = heap; | 877 | scan.heap = heap; |
| 856 | cgroup_scan_tasks(&scan); | 878 | cgroup_scan_tasks(&scan); |
| 857 | } | 879 | } |
| 858 | 880 | ||
| 881 | /* | ||
| 882 | * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. | ||
| 883 | * @root_cs: the root cpuset of the hierarchy | ||
| 884 | * @update_root: update root cpuset or not? | ||
| 885 | * @heap: the heap used by cgroup_scan_tasks() | ||
| 886 | * | ||
| 887 | * This will update cpumasks of tasks in @root_cs and all other empty cpusets | ||
| 888 | * which take on cpumask of @root_cs. | ||
| 889 | * | ||
| 890 | * Called with cpuset_mutex held | ||
| 891 | */ | ||
| 892 | static void update_tasks_cpumask_hier(struct cpuset *root_cs, | ||
| 893 | bool update_root, struct ptr_heap *heap) | ||
| 894 | { | ||
| 895 | struct cpuset *cp; | ||
| 896 | struct cgroup *pos_cgrp; | ||
| 897 | |||
| 898 | if (update_root) | ||
| 899 | update_tasks_cpumask(root_cs, heap); | ||
| 900 | |||
| 901 | rcu_read_lock(); | ||
| 902 | cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { | ||
| 903 | /* skip the whole subtree if @cp has some CPUs */ | ||
| 904 | if (!cpumask_empty(cp->cpus_allowed)) { | ||
| 905 | pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); | ||
| 906 | continue; | ||
| 907 | } | ||
| 908 | if (!css_tryget(&cp->css)) | ||
| 909 | continue; | ||
| 910 | rcu_read_unlock(); | ||
| 911 | |||
| 912 | update_tasks_cpumask(cp, heap); | ||
| 913 | |||
| 914 | rcu_read_lock(); | ||
| 915 | css_put(&cp->css); | ||
| 916 | } | ||
| 917 | rcu_read_unlock(); | ||
| 918 | } | ||
| 919 | |||
| 859 | /** | 920 | /** |
| 860 | * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it | 921 | * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it |
| 861 | * @cs: the cpuset to consider | 922 | * @cs: the cpuset to consider |
| @@ -888,14 +949,15 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, | |||
| 888 | if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask)) | 949 | if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask)) |
| 889 | return -EINVAL; | 950 | return -EINVAL; |
| 890 | } | 951 | } |
| 891 | retval = validate_change(cs, trialcs); | ||
| 892 | if (retval < 0) | ||
| 893 | return retval; | ||
| 894 | 952 | ||
| 895 | /* Nothing to do if the cpus didn't change */ | 953 | /* Nothing to do if the cpus didn't change */ |
| 896 | if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) | 954 | if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) |
| 897 | return 0; | 955 | return 0; |
| 898 | 956 | ||
| 957 | retval = validate_change(cs, trialcs); | ||
| 958 | if (retval < 0) | ||
| 959 | return retval; | ||
| 960 | |||
| 899 | retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); | 961 | retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); |
| 900 | if (retval) | 962 | if (retval) |
| 901 | return retval; | 963 | return retval; |
| @@ -906,11 +968,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, | |||
| 906 | cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); | 968 | cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); |
| 907 | mutex_unlock(&callback_mutex); | 969 | mutex_unlock(&callback_mutex); |
| 908 | 970 | ||
| 909 | /* | 971 | update_tasks_cpumask_hier(cs, true, &heap); |
| 910 | * Scan tasks in the cpuset, and update the cpumasks of any | ||
| 911 | * that need an update. | ||
| 912 | */ | ||
| 913 | update_tasks_cpumask(cs, &heap); | ||
| 914 | 972 | ||
| 915 | heap_free(&heap); | 973 | heap_free(&heap); |
| 916 | 974 | ||
| @@ -943,12 +1001,14 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, | |||
| 943 | const nodemask_t *to) | 1001 | const nodemask_t *to) |
| 944 | { | 1002 | { |
| 945 | struct task_struct *tsk = current; | 1003 | struct task_struct *tsk = current; |
| 1004 | struct cpuset *mems_cs; | ||
| 946 | 1005 | ||
| 947 | tsk->mems_allowed = *to; | 1006 | tsk->mems_allowed = *to; |
| 948 | 1007 | ||
| 949 | do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); | 1008 | do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); |
| 950 | 1009 | ||
| 951 | guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed); | 1010 | mems_cs = effective_nodemask_cpuset(task_cs(tsk)); |
| 1011 | guarantee_online_mems(mems_cs, &tsk->mems_allowed); | ||
| 952 | } | 1012 | } |
| 953 | 1013 | ||
| 954 | /* | 1014 | /* |
| @@ -1007,16 +1067,12 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, | |||
| 1007 | static void cpuset_change_nodemask(struct task_struct *p, | 1067 | static void cpuset_change_nodemask(struct task_struct *p, |
| 1008 | struct cgroup_scanner *scan) | 1068 | struct cgroup_scanner *scan) |
| 1009 | { | 1069 | { |
| 1070 | struct cpuset *cs = cgroup_cs(scan->cg); | ||
| 1010 | struct mm_struct *mm; | 1071 | struct mm_struct *mm; |
| 1011 | struct cpuset *cs; | ||
| 1012 | int migrate; | 1072 | int migrate; |
| 1013 | const nodemask_t *oldmem = scan->data; | 1073 | nodemask_t *newmems = scan->data; |
| 1014 | static nodemask_t newmems; /* protected by cpuset_mutex */ | ||
| 1015 | 1074 | ||
| 1016 | cs = cgroup_cs(scan->cg); | 1075 | cpuset_change_task_nodemask(p, newmems); |
| 1017 | guarantee_online_mems(cs, &newmems); | ||
| 1018 | |||
| 1019 | cpuset_change_task_nodemask(p, &newmems); | ||
| 1020 | 1076 | ||
| 1021 | mm = get_task_mm(p); | 1077 | mm = get_task_mm(p); |
| 1022 | if (!mm) | 1078 | if (!mm) |
| @@ -1026,7 +1082,7 @@ static void cpuset_change_nodemask(struct task_struct *p, | |||
| 1026 | 1082 | ||
| 1027 | mpol_rebind_mm(mm, &cs->mems_allowed); | 1083 | mpol_rebind_mm(mm, &cs->mems_allowed); |
| 1028 | if (migrate) | 1084 | if (migrate) |
| 1029 | cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed); | 1085 | cpuset_migrate_mm(mm, &cs->old_mems_allowed, newmems); |
| 1030 | mmput(mm); | 1086 | mmput(mm); |
| 1031 | } | 1087 | } |
| 1032 | 1088 | ||
| @@ -1035,25 +1091,27 @@ static void *cpuset_being_rebound; | |||
| 1035 | /** | 1091 | /** |
| 1036 | * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. | 1092 | * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. |
| 1037 | * @cs: the cpuset in which each task's mems_allowed mask needs to be changed | 1093 | * @cs: the cpuset in which each task's mems_allowed mask needs to be changed |
| 1038 | * @oldmem: old mems_allowed of cpuset cs | ||
| 1039 | * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() | 1094 | * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() |
| 1040 | * | 1095 | * |
| 1041 | * Called with cpuset_mutex held | 1096 | * Called with cpuset_mutex held |
| 1042 | * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 | 1097 | * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 |
| 1043 | * if @heap != NULL. | 1098 | * if @heap != NULL. |
| 1044 | */ | 1099 | */ |
| 1045 | static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, | 1100 | static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) |
| 1046 | struct ptr_heap *heap) | ||
| 1047 | { | 1101 | { |
| 1102 | static nodemask_t newmems; /* protected by cpuset_mutex */ | ||
| 1048 | struct cgroup_scanner scan; | 1103 | struct cgroup_scanner scan; |
| 1104 | struct cpuset *mems_cs = effective_nodemask_cpuset(cs); | ||
| 1049 | 1105 | ||
| 1050 | cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ | 1106 | cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ |
| 1051 | 1107 | ||
| 1108 | guarantee_online_mems(mems_cs, &newmems); | ||
| 1109 | |||
| 1052 | scan.cg = cs->css.cgroup; | 1110 | scan.cg = cs->css.cgroup; |
| 1053 | scan.test_task = NULL; | 1111 | scan.test_task = NULL; |
| 1054 | scan.process_task = cpuset_change_nodemask; | 1112 | scan.process_task = cpuset_change_nodemask; |
| 1055 | scan.heap = heap; | 1113 | scan.heap = heap; |
| 1056 | scan.data = (nodemask_t *)oldmem; | 1114 | scan.data = &newmems; |
| 1057 | 1115 | ||
| 1058 | /* | 1116 | /* |
| 1059 | * The mpol_rebind_mm() call takes mmap_sem, which we couldn't | 1117 | * The mpol_rebind_mm() call takes mmap_sem, which we couldn't |
| @@ -1067,11 +1125,56 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, | |||
| 1067 | */ | 1125 | */ |
| 1068 | cgroup_scan_tasks(&scan); | 1126 | cgroup_scan_tasks(&scan); |
| 1069 | 1127 | ||
| 1128 | /* | ||
| 1129 | * All the tasks' nodemasks have been updated, update | ||
| 1130 | * cs->old_mems_allowed. | ||
| 1131 | */ | ||
| 1132 | cs->old_mems_allowed = newmems; | ||
| 1133 | |||
| 1070 | /* We're done rebinding vmas to this cpuset's new mems_allowed. */ | 1134 | /* We're done rebinding vmas to this cpuset's new mems_allowed. */ |
| 1071 | cpuset_being_rebound = NULL; | 1135 | cpuset_being_rebound = NULL; |
| 1072 | } | 1136 | } |
| 1073 | 1137 | ||
| 1074 | /* | 1138 | /* |
| 1139 | * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. | ||
| 1140 | * @root_cs: the root cpuset of the hierarchy | ||
| 1141 | * @update_root: update the root cpuset or not? | ||
| 1142 | * @heap: the heap used by cgroup_scan_tasks() | ||
| 1143 | * | ||
| 1144 | * This will update nodemasks of tasks in @root_cs and all other empty cpusets | ||
| 1145 | * which take on nodemask of @root_cs. | ||
| 1146 | * | ||
| 1147 | * Called with cpuset_mutex held | ||
| 1148 | */ | ||
| 1149 | static void update_tasks_nodemask_hier(struct cpuset *root_cs, | ||
| 1150 | bool update_root, struct ptr_heap *heap) | ||
| 1151 | { | ||
| 1152 | struct cpuset *cp; | ||
| 1153 | struct cgroup *pos_cgrp; | ||
| 1154 | |||
| 1155 | if (update_root) | ||
| 1156 | update_tasks_nodemask(root_cs, heap); | ||
| 1157 | |||
| 1158 | rcu_read_lock(); | ||
| 1159 | cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { | ||
| 1161 | /* skip the whole subtree if @cp has some mems */ | ||
| 1161 | if (!nodes_empty(cp->mems_allowed)) { | ||
| 1162 | pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); | ||
| 1163 | continue; | ||
| 1164 | } | ||
| 1165 | if (!css_tryget(&cp->css)) | ||
| 1166 | continue; | ||
| 1167 | rcu_read_unlock(); | ||
| 1168 | |||
| 1169 | update_tasks_nodemask(cp, heap); | ||
| 1170 | |||
| 1171 | rcu_read_lock(); | ||
| 1172 | css_put(&cp->css); | ||
| 1173 | } | ||
| 1174 | rcu_read_unlock(); | ||
| 1175 | } | ||
| 1176 | |||
| 1177 | /* | ||
| 1075 | * Handle user request to change the 'mems' memory placement | 1178 | * Handle user request to change the 'mems' memory placement |
| 1076 | * of a cpuset. Needs to validate the request, update the | 1179 | * of a cpuset. Needs to validate the request, update the |
| 1077 | * cpusets mems_allowed, and for each task in the cpuset, | 1180 | * cpusets mems_allowed, and for each task in the cpuset, |
| @@ -1087,13 +1190,9 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, | |||
| 1087 | static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, | 1190 | static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, |
| 1088 | const char *buf) | 1191 | const char *buf) |
| 1089 | { | 1192 | { |
| 1090 | NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL); | ||
| 1091 | int retval; | 1193 | int retval; |
| 1092 | struct ptr_heap heap; | 1194 | struct ptr_heap heap; |
| 1093 | 1195 | ||
| 1094 | if (!oldmem) | ||
| 1095 | return -ENOMEM; | ||
| 1096 | |||
| 1097 | /* | 1196 | /* |
| 1098 | * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; | 1197 | * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; |
| 1099 | * it's read-only | 1198 | * it's read-only |
| @@ -1122,8 +1221,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, | |||
| 1122 | goto done; | 1221 | goto done; |
| 1123 | } | 1222 | } |
| 1124 | } | 1223 | } |
| 1125 | *oldmem = cs->mems_allowed; | 1224 | |
| 1126 | if (nodes_equal(*oldmem, trialcs->mems_allowed)) { | 1225 | if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) { |
| 1127 | retval = 0; /* Too easy - nothing to do */ | 1226 | retval = 0; /* Too easy - nothing to do */ |
| 1128 | goto done; | 1227 | goto done; |
| 1129 | } | 1228 | } |
| @@ -1139,11 +1238,10 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, | |||
| 1139 | cs->mems_allowed = trialcs->mems_allowed; | 1238 | cs->mems_allowed = trialcs->mems_allowed; |
| 1140 | mutex_unlock(&callback_mutex); | 1239 | mutex_unlock(&callback_mutex); |
| 1141 | 1240 | ||
| 1142 | update_tasks_nodemask(cs, oldmem, &heap); | 1241 | update_tasks_nodemask_hier(cs, true, &heap); |
| 1143 | 1242 | ||
| 1144 | heap_free(&heap); | 1243 | heap_free(&heap); |
| 1145 | done: | 1244 | done: |
| 1146 | NODEMASK_FREE(oldmem); | ||
| 1147 | return retval; | 1245 | return retval; |
| 1148 | } | 1246 | } |
| 1149 | 1247 | ||
| @@ -1372,8 +1470,13 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | |||
| 1372 | 1470 | ||
| 1373 | mutex_lock(&cpuset_mutex); | 1471 | mutex_lock(&cpuset_mutex); |
| 1374 | 1472 | ||
| 1473 | /* | ||
| 1474 | * We allow to move tasks into an empty cpuset if sane_behavior | ||
| 1475 | * flag is set. | ||
| 1476 | */ | ||
| 1375 | ret = -ENOSPC; | 1477 | ret = -ENOSPC; |
| 1376 | if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) | 1478 | if (!cgroup_sane_behavior(cgrp) && |
| 1479 | (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) | ||
| 1377 | goto out_unlock; | 1480 | goto out_unlock; |
| 1378 | 1481 | ||
| 1379 | cgroup_taskset_for_each(task, cgrp, tset) { | 1482 | cgroup_taskset_for_each(task, cgrp, tset) { |
| @@ -1422,8 +1525,7 @@ static cpumask_var_t cpus_attach; | |||
| 1422 | 1525 | ||
| 1423 | static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | 1526 | static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) |
| 1424 | { | 1527 | { |
| 1425 | /* static bufs protected by cpuset_mutex */ | 1528 | /* static buf protected by cpuset_mutex */ |
| 1426 | static nodemask_t cpuset_attach_nodemask_from; | ||
| 1427 | static nodemask_t cpuset_attach_nodemask_to; | 1529 | static nodemask_t cpuset_attach_nodemask_to; |
| 1428 | struct mm_struct *mm; | 1530 | struct mm_struct *mm; |
| 1429 | struct task_struct *task; | 1531 | struct task_struct *task; |
| @@ -1431,6 +1533,8 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | |||
| 1431 | struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); | 1533 | struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); |
| 1432 | struct cpuset *cs = cgroup_cs(cgrp); | 1534 | struct cpuset *cs = cgroup_cs(cgrp); |
| 1433 | struct cpuset *oldcs = cgroup_cs(oldcgrp); | 1535 | struct cpuset *oldcs = cgroup_cs(oldcgrp); |
| 1536 | struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); | ||
| 1537 | struct cpuset *mems_cs = effective_nodemask_cpuset(cs); | ||
| 1434 | 1538 | ||
| 1435 | mutex_lock(&cpuset_mutex); | 1539 | mutex_lock(&cpuset_mutex); |
| 1436 | 1540 | ||
| @@ -1438,9 +1542,9 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | |||
| 1438 | if (cs == &top_cpuset) | 1542 | if (cs == &top_cpuset) |
| 1439 | cpumask_copy(cpus_attach, cpu_possible_mask); | 1543 | cpumask_copy(cpus_attach, cpu_possible_mask); |
| 1440 | else | 1544 | else |
| 1441 | guarantee_online_cpus(cs, cpus_attach); | 1545 | guarantee_online_cpus(cpus_cs, cpus_attach); |
| 1442 | 1546 | ||
| 1443 | guarantee_online_mems(cs, &cpuset_attach_nodemask_to); | 1547 | guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); |
| 1444 | 1548 | ||
| 1445 | cgroup_taskset_for_each(task, cgrp, tset) { | 1549 | cgroup_taskset_for_each(task, cgrp, tset) { |
| 1446 | /* | 1550 | /* |
| @@ -1457,26 +1561,32 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | |||
| 1457 | * Change mm, possibly for multiple threads in a threadgroup. This is | 1561 | * Change mm, possibly for multiple threads in a threadgroup. This is |
| 1458 | * expensive and may sleep. | 1562 | * expensive and may sleep. |
| 1459 | */ | 1563 | */ |
| 1460 | cpuset_attach_nodemask_from = oldcs->mems_allowed; | ||
| 1461 | cpuset_attach_nodemask_to = cs->mems_allowed; | 1564 | cpuset_attach_nodemask_to = cs->mems_allowed; |
| 1462 | mm = get_task_mm(leader); | 1565 | mm = get_task_mm(leader); |
| 1463 | if (mm) { | 1566 | if (mm) { |
| 1567 | struct cpuset *mems_oldcs = effective_nodemask_cpuset(oldcs); | ||
| 1568 | |||
| 1464 | mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); | 1569 | mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); |
| 1465 | if (is_memory_migrate(cs)) | 1570 | |
| 1466 | cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from, | 1571 | /* |
| 1572 | * old_mems_allowed is the same as mems_allowed here, except | ||
| 1573 | * if this task is being moved automatically due to hotplug. | ||
| 1574 | * In that case @mems_allowed has been updated and is empty, | ||
| 1575 | * so @old_mems_allowed is the right nodemask to migrate the | ||
| 1576 | * mm from. | ||
| 1577 | */ | ||
| 1578 | if (is_memory_migrate(cs)) { | ||
| 1579 | cpuset_migrate_mm(mm, &mems_oldcs->old_mems_allowed, | ||
| 1467 | &cpuset_attach_nodemask_to); | 1580 | &cpuset_attach_nodemask_to); |
| 1581 | } | ||
| 1468 | mmput(mm); | 1582 | mmput(mm); |
| 1469 | } | 1583 | } |
| 1470 | 1584 | ||
| 1471 | cs->attach_in_progress--; | 1585 | cs->old_mems_allowed = cpuset_attach_nodemask_to; |
| 1472 | 1586 | ||
| 1473 | /* | 1587 | cs->attach_in_progress--; |
| 1474 | * We may have raced with CPU/memory hotunplug. Trigger hotplug | 1588 | if (!cs->attach_in_progress) |
| 1475 | * propagation if @cs doesn't have any CPU or memory. It will move | 1589 | wake_up(&cpuset_attach_wq); |
| 1476 | * the newly added tasks to the nearest parent which can execute. | ||
| 1477 | */ | ||
| 1478 | if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) | ||
| 1479 | schedule_cpuset_propagate_hotplug(cs); | ||
| 1480 | 1590 | ||
| 1481 | mutex_unlock(&cpuset_mutex); | 1591 | mutex_unlock(&cpuset_mutex); |
| 1482 | } | 1592 | } |
| @@ -1502,11 +1612,13 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) | |||
| 1502 | { | 1612 | { |
| 1503 | struct cpuset *cs = cgroup_cs(cgrp); | 1613 | struct cpuset *cs = cgroup_cs(cgrp); |
| 1504 | cpuset_filetype_t type = cft->private; | 1614 | cpuset_filetype_t type = cft->private; |
| 1505 | int retval = -ENODEV; | 1615 | int retval = 0; |
| 1506 | 1616 | ||
| 1507 | mutex_lock(&cpuset_mutex); | 1617 | mutex_lock(&cpuset_mutex); |
| 1508 | if (!is_cpuset_online(cs)) | 1618 | if (!is_cpuset_online(cs)) { |
| 1619 | retval = -ENODEV; | ||
| 1509 | goto out_unlock; | 1620 | goto out_unlock; |
| 1621 | } | ||
| 1510 | 1622 | ||
| 1511 | switch (type) { | 1623 | switch (type) { |
| 1512 | case FILE_CPU_EXCLUSIVE: | 1624 | case FILE_CPU_EXCLUSIVE: |
| @@ -1588,13 +1700,8 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, | |||
| 1588 | * resources, wait for the previously scheduled operations before | 1700 | * resources, wait for the previously scheduled operations before |
| 1589 | * proceeding, so that we don't end up keep removing tasks added | 1701 | * proceeding, so that we don't end up keep removing tasks added |
| 1590 | * after execution capability is restored. | 1702 | * after execution capability is restored. |
| 1591 | * | ||
| 1592 | * Flushing cpuset_hotplug_work is enough to synchronize against | ||
| 1593 | * hotplug hanlding; however, cpuset_attach() may schedule | ||
| 1594 | * propagation work directly. Flush the workqueue too. | ||
| 1595 | */ | 1703 | */ |
| 1596 | flush_work(&cpuset_hotplug_work); | 1704 | flush_work(&cpuset_hotplug_work); |
| 1597 | flush_workqueue(cpuset_propagate_hotplug_wq); | ||
| 1598 | 1705 | ||
| 1599 | mutex_lock(&cpuset_mutex); | 1706 | mutex_lock(&cpuset_mutex); |
| 1600 | if (!is_cpuset_online(cs)) | 1707 | if (!is_cpuset_online(cs)) |
| @@ -1658,13 +1765,13 @@ static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs) | |||
| 1658 | return count; | 1765 | return count; |
| 1659 | } | 1766 | } |
| 1660 | 1767 | ||
| 1661 | static ssize_t cpuset_common_file_read(struct cgroup *cont, | 1768 | static ssize_t cpuset_common_file_read(struct cgroup *cgrp, |
| 1662 | struct cftype *cft, | 1769 | struct cftype *cft, |
| 1663 | struct file *file, | 1770 | struct file *file, |
| 1664 | char __user *buf, | 1771 | char __user *buf, |
| 1665 | size_t nbytes, loff_t *ppos) | 1772 | size_t nbytes, loff_t *ppos) |
| 1666 | { | 1773 | { |
| 1667 | struct cpuset *cs = cgroup_cs(cont); | 1774 | struct cpuset *cs = cgroup_cs(cgrp); |
| 1668 | cpuset_filetype_t type = cft->private; | 1775 | cpuset_filetype_t type = cft->private; |
| 1669 | char *page; | 1776 | char *page; |
| 1670 | ssize_t retval = 0; | 1777 | ssize_t retval = 0; |
| @@ -1694,9 +1801,9 @@ out: | |||
| 1694 | return retval; | 1801 | return retval; |
| 1695 | } | 1802 | } |
| 1696 | 1803 | ||
| 1697 | static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft) | 1804 | static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft) |
| 1698 | { | 1805 | { |
| 1699 | struct cpuset *cs = cgroup_cs(cont); | 1806 | struct cpuset *cs = cgroup_cs(cgrp); |
| 1700 | cpuset_filetype_t type = cft->private; | 1807 | cpuset_filetype_t type = cft->private; |
| 1701 | switch (type) { | 1808 | switch (type) { |
| 1702 | case FILE_CPU_EXCLUSIVE: | 1809 | case FILE_CPU_EXCLUSIVE: |
| @@ -1725,9 +1832,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft) | |||
| 1725 | return 0; | 1832 | return 0; |
| 1726 | } | 1833 | } |
| 1727 | 1834 | ||
| 1728 | static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) | 1835 | static s64 cpuset_read_s64(struct cgroup *cgrp, struct cftype *cft) |
| 1729 | { | 1836 | { |
| 1730 | struct cpuset *cs = cgroup_cs(cont); | 1837 | struct cpuset *cs = cgroup_cs(cgrp); |
| 1731 | cpuset_filetype_t type = cft->private; | 1838 | cpuset_filetype_t type = cft->private; |
| 1732 | switch (type) { | 1839 | switch (type) { |
| 1733 | case FILE_SCHED_RELAX_DOMAIN_LEVEL: | 1840 | case FILE_SCHED_RELAX_DOMAIN_LEVEL: |
| @@ -1839,14 +1946,14 @@ static struct cftype files[] = { | |||
| 1839 | 1946 | ||
| 1840 | /* | 1947 | /* |
| 1841 | * cpuset_css_alloc - allocate a cpuset css | 1948 | * cpuset_css_alloc - allocate a cpuset css |
| 1842 | * cont: control group that the new cpuset will be part of | 1949 | * cgrp: control group that the new cpuset will be part of |
| 1843 | */ | 1950 | */ |
| 1844 | 1951 | ||
| 1845 | static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) | 1952 | static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp) |
| 1846 | { | 1953 | { |
| 1847 | struct cpuset *cs; | 1954 | struct cpuset *cs; |
| 1848 | 1955 | ||
| 1849 | if (!cont->parent) | 1956 | if (!cgrp->parent) |
| 1850 | return &top_cpuset.css; | 1957 | return &top_cpuset.css; |
| 1851 | 1958 | ||
| 1852 | cs = kzalloc(sizeof(*cs), GFP_KERNEL); | 1959 | cs = kzalloc(sizeof(*cs), GFP_KERNEL); |
| @@ -1861,7 +1968,6 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) | |||
| 1861 | cpumask_clear(cs->cpus_allowed); | 1968 | cpumask_clear(cs->cpus_allowed); |
| 1862 | nodes_clear(cs->mems_allowed); | 1969 | nodes_clear(cs->mems_allowed); |
| 1863 | fmeter_init(&cs->fmeter); | 1970 | fmeter_init(&cs->fmeter); |
| 1864 | INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn); | ||
| 1865 | cs->relax_domain_level = -1; | 1971 | cs->relax_domain_level = -1; |
| 1866 | 1972 | ||
| 1867 | return &cs->css; | 1973 | return &cs->css; |
| @@ -1942,9 +2048,9 @@ static void cpuset_css_offline(struct cgroup *cgrp) | |||
| 1942 | * will call rebuild_sched_domains_locked(). | 2048 | * will call rebuild_sched_domains_locked(). |
| 1943 | */ | 2049 | */ |
| 1944 | 2050 | ||
| 1945 | static void cpuset_css_free(struct cgroup *cont) | 2051 | static void cpuset_css_free(struct cgroup *cgrp) |
| 1946 | { | 2052 | { |
| 1947 | struct cpuset *cs = cgroup_cs(cont); | 2053 | struct cpuset *cs = cgroup_cs(cgrp); |
| 1948 | 2054 | ||
| 1949 | free_cpumask_var(cs->cpus_allowed); | 2055 | free_cpumask_var(cs->cpus_allowed); |
| 1950 | kfree(cs); | 2056 | kfree(cs); |
| @@ -2024,41 +2130,64 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) | |||
| 2024 | } | 2130 | } |
| 2025 | 2131 | ||
| 2026 | /** | 2132 | /** |
| 2027 | * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset | 2133 | * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug |
| 2028 | * @cs: cpuset in interest | 2134 | * @cs: cpuset in interest |
| 2029 | * | 2135 | * |
| 2030 | * Compare @cs's cpu and mem masks against top_cpuset and if some have gone | 2136 | * Compare @cs's cpu and mem masks against top_cpuset and if some have gone |
| 2031 | * offline, update @cs accordingly. If @cs ends up with no CPU or memory, | 2137 | * offline, update @cs accordingly. If @cs ends up with no CPU or memory, |
| 2032 | * all its tasks are moved to the nearest ancestor with both resources. | 2138 | * all its tasks are moved to the nearest ancestor with both resources. |
| 2033 | */ | 2139 | */ |
| 2034 | static void cpuset_propagate_hotplug_workfn(struct work_struct *work) | 2140 | static void cpuset_hotplug_update_tasks(struct cpuset *cs) |
| 2035 | { | 2141 | { |
| 2036 | static cpumask_t off_cpus; | 2142 | static cpumask_t off_cpus; |
| 2037 | static nodemask_t off_mems, tmp_mems; | 2143 | static nodemask_t off_mems; |
| 2038 | struct cpuset *cs = container_of(work, struct cpuset, hotplug_work); | ||
| 2039 | bool is_empty; | 2144 | bool is_empty; |
| 2145 | bool sane = cgroup_sane_behavior(cs->css.cgroup); | ||
| 2146 | |||
| 2147 | retry: | ||
| 2148 | wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); | ||
| 2040 | 2149 | ||
| 2041 | mutex_lock(&cpuset_mutex); | 2150 | mutex_lock(&cpuset_mutex); |
| 2042 | 2151 | ||
| 2152 | /* | ||
| 2153 | * We have raced with task attaching. We wait until attaching | ||
| 2154 | * is finished, so we won't attach a task to an empty cpuset. | ||
| 2155 | */ | ||
| 2156 | if (cs->attach_in_progress) { | ||
| 2157 | mutex_unlock(&cpuset_mutex); | ||
| 2158 | goto retry; | ||
| 2159 | } | ||
| 2160 | |||
| 2043 | cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); | 2161 | cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); |
| 2044 | nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); | 2162 | nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); |
| 2045 | 2163 | ||
| 2046 | /* remove offline cpus from @cs */ | 2164 | mutex_lock(&callback_mutex); |
| 2047 | if (!cpumask_empty(&off_cpus)) { | 2165 | cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus); |
| 2048 | mutex_lock(&callback_mutex); | 2166 | mutex_unlock(&callback_mutex); |
| 2049 | cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus); | 2167 | |
| 2050 | mutex_unlock(&callback_mutex); | 2168 | /* |
| 2169 | * If sane_behavior flag is set, we need to update tasks' cpumask | ||
| 2170 | * for empty cpuset to take on ancestor's cpumask. Otherwise, don't | ||
| 2171 | * call update_tasks_cpumask() if the cpuset becomes empty, as | ||
| 2172 | * the tasks in it will be migrated to an ancestor. | ||
| 2173 | */ | ||
| 2174 | if ((sane && cpumask_empty(cs->cpus_allowed)) || | ||
| 2175 | (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed))) | ||
| 2051 | update_tasks_cpumask(cs, NULL); | 2176 | update_tasks_cpumask(cs, NULL); |
| 2052 | } | ||
| 2053 | 2177 | ||
| 2054 | /* remove offline mems from @cs */ | 2178 | mutex_lock(&callback_mutex); |
| 2055 | if (!nodes_empty(off_mems)) { | 2179 | nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); |
| 2056 | tmp_mems = cs->mems_allowed; | 2180 | mutex_unlock(&callback_mutex); |
| 2057 | mutex_lock(&callback_mutex); | 2181 | |
| 2058 | nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); | 2182 | /* |
| 2059 | mutex_unlock(&callback_mutex); | 2183 | * If sane_behavior flag is set, we need to update tasks' nodemask |
| 2060 | update_tasks_nodemask(cs, &tmp_mems, NULL); | 2184 | * for empty cpuset to take on ancestor's nodemask. Otherwise, don't |
| 2061 | } | 2185 | * call update_tasks_nodemask() if the cpuset becomes empty, as |
| 2186 | * the tasks in it will be migrated to an ancestor. | ||
| 2187 | */ | ||
| 2188 | if ((sane && nodes_empty(cs->mems_allowed)) || | ||
| 2189 | (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed))) | ||
| 2190 | update_tasks_nodemask(cs, NULL); | ||
| 2062 | 2191 | ||
| 2063 | is_empty = cpumask_empty(cs->cpus_allowed) || | 2192 | is_empty = cpumask_empty(cs->cpus_allowed) || |
| 2064 | nodes_empty(cs->mems_allowed); | 2193 | nodes_empty(cs->mems_allowed); |
| @@ -2066,40 +2195,14 @@ static void cpuset_propagate_hotplug_workfn(struct work_struct *work) | |||
| 2066 | mutex_unlock(&cpuset_mutex); | 2195 | mutex_unlock(&cpuset_mutex); |
| 2067 | 2196 | ||
| 2068 | /* | 2197 | /* |
| 2069 | * If @cs became empty, move tasks to the nearest ancestor with | 2198 | * If sane_behavior flag is set, we'll keep tasks in empty cpusets. |
| 2070 | * execution resources. This is full cgroup operation which will | 2199 | * |
| 2200 | * Otherwise move tasks to the nearest ancestor with execution | ||
| 2201 | * resources. This is full cgroup operation which will | ||
| 2071 | * also call back into cpuset. Should be done outside any lock. | 2202 | * also call back into cpuset. Should be done outside any lock. |
| 2072 | */ | 2203 | */ |
| 2073 | if (is_empty) | 2204 | if (!sane && is_empty) |
| 2074 | remove_tasks_in_empty_cpuset(cs); | 2205 | remove_tasks_in_empty_cpuset(cs); |
| 2075 | |||
| 2076 | /* the following may free @cs, should be the last operation */ | ||
| 2077 | css_put(&cs->css); | ||
| 2078 | } | ||
| 2079 | |||
| 2080 | /** | ||
| 2081 | * schedule_cpuset_propagate_hotplug - schedule hotplug propagation to a cpuset | ||
| 2082 | * @cs: cpuset of interest | ||
| 2083 | * | ||
| 2084 | * Schedule cpuset_propagate_hotplug_workfn() which will update CPU and | ||
| 2085 | * memory masks according to top_cpuset. | ||
| 2086 | */ | ||
| 2087 | static void schedule_cpuset_propagate_hotplug(struct cpuset *cs) | ||
| 2088 | { | ||
| 2089 | /* | ||
| 2090 | * Pin @cs. The refcnt will be released when the work item | ||
| 2091 | * finishes executing. | ||
| 2092 | */ | ||
| 2093 | if (!css_tryget(&cs->css)) | ||
| 2094 | return; | ||
| 2095 | |||
| 2096 | /* | ||
| 2097 | * Queue @cs->hotplug_work. If already pending, lose the css ref. | ||
| 2098 | * cpuset_propagate_hotplug_wq is ordered and propagation will | ||
| 2099 | * happen in the order this function is called. | ||
| 2100 | */ | ||
| 2101 | if (!queue_work(cpuset_propagate_hotplug_wq, &cs->hotplug_work)) | ||
| 2102 | css_put(&cs->css); | ||
| 2103 | } | 2206 | } |
| 2104 | 2207 | ||
| 2105 | /** | 2208 | /** |
| @@ -2112,18 +2215,17 @@ static void schedule_cpuset_propagate_hotplug(struct cpuset *cs) | |||
| 2112 | * actively using CPU hotplug but making no active use of cpusets. | 2215 | * actively using CPU hotplug but making no active use of cpusets. |
| 2113 | * | 2216 | * |
| 2114 | * Non-root cpusets are only affected by offlining. If any CPUs or memory | 2217 | * Non-root cpusets are only affected by offlining. If any CPUs or memory |
| 2115 | * nodes have been taken down, cpuset_propagate_hotplug() is invoked on all | 2218 | * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on |
| 2116 | * descendants. | 2219 | * all descendants. |
| 2117 | * | 2220 | * |
| 2118 | * Note that CPU offlining during suspend is ignored. We don't modify | 2221 | * Note that CPU offlining during suspend is ignored. We don't modify |
| 2119 | * cpusets across suspend/resume cycles at all. | 2222 | * cpusets across suspend/resume cycles at all. |
| 2120 | */ | 2223 | */ |
| 2121 | static void cpuset_hotplug_workfn(struct work_struct *work) | 2224 | static void cpuset_hotplug_workfn(struct work_struct *work) |
| 2122 | { | 2225 | { |
| 2123 | static cpumask_t new_cpus, tmp_cpus; | 2226 | static cpumask_t new_cpus; |
| 2124 | static nodemask_t new_mems, tmp_mems; | 2227 | static nodemask_t new_mems; |
| 2125 | bool cpus_updated, mems_updated; | 2228 | bool cpus_updated, mems_updated; |
| 2126 | bool cpus_offlined, mems_offlined; | ||
| 2127 | 2229 | ||
| 2128 | mutex_lock(&cpuset_mutex); | 2230 | mutex_lock(&cpuset_mutex); |
| 2129 | 2231 | ||
| @@ -2132,12 +2234,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) | |||
| 2132 | new_mems = node_states[N_MEMORY]; | 2234 | new_mems = node_states[N_MEMORY]; |
| 2133 | 2235 | ||
| 2134 | cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus); | 2236 | cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus); |
| 2135 | cpus_offlined = cpumask_andnot(&tmp_cpus, top_cpuset.cpus_allowed, | ||
| 2136 | &new_cpus); | ||
| 2137 | |||
| 2138 | mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems); | 2237 | mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems); |
| 2139 | nodes_andnot(tmp_mems, top_cpuset.mems_allowed, new_mems); | ||
| 2140 | mems_offlined = !nodes_empty(tmp_mems); | ||
| 2141 | 2238 | ||
| 2142 | /* synchronize cpus_allowed to cpu_active_mask */ | 2239 | /* synchronize cpus_allowed to cpu_active_mask */ |
| 2143 | if (cpus_updated) { | 2240 | if (cpus_updated) { |
| @@ -2149,28 +2246,32 @@ static void cpuset_hotplug_workfn(struct work_struct *work) | |||
| 2149 | 2246 | ||
| 2150 | /* synchronize mems_allowed to N_MEMORY */ | 2247 | /* synchronize mems_allowed to N_MEMORY */ |
| 2151 | if (mems_updated) { | 2248 | if (mems_updated) { |
| 2152 | tmp_mems = top_cpuset.mems_allowed; | ||
| 2153 | mutex_lock(&callback_mutex); | 2249 | mutex_lock(&callback_mutex); |
| 2154 | top_cpuset.mems_allowed = new_mems; | 2250 | top_cpuset.mems_allowed = new_mems; |
| 2155 | mutex_unlock(&callback_mutex); | 2251 | mutex_unlock(&callback_mutex); |
| 2156 | update_tasks_nodemask(&top_cpuset, &tmp_mems, NULL); | 2252 | update_tasks_nodemask(&top_cpuset, NULL); |
| 2157 | } | 2253 | } |
| 2158 | 2254 | ||
| 2159 | /* if cpus or mems went down, we need to propagate to descendants */ | 2255 | mutex_unlock(&cpuset_mutex); |
| 2160 | if (cpus_offlined || mems_offlined) { | 2256 | |
| 2257 | /* if cpus or mems changed, we need to propagate to descendants */ | ||
| 2258 | if (cpus_updated || mems_updated) { | ||
| 2161 | struct cpuset *cs; | 2259 | struct cpuset *cs; |
| 2162 | struct cgroup *pos_cgrp; | 2260 | struct cgroup *pos_cgrp; |
| 2163 | 2261 | ||
| 2164 | rcu_read_lock(); | 2262 | rcu_read_lock(); |
| 2165 | cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) | 2263 | cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) { |
| 2166 | schedule_cpuset_propagate_hotplug(cs); | 2264 | if (!css_tryget(&cs->css)) |
| 2167 | rcu_read_unlock(); | 2265 | continue; |
| 2168 | } | 2266 | rcu_read_unlock(); |
| 2169 | 2267 | ||
| 2170 | mutex_unlock(&cpuset_mutex); | 2268 | cpuset_hotplug_update_tasks(cs); |
| 2171 | 2269 | ||
| 2172 | /* wait for propagations to finish */ | 2270 | rcu_read_lock(); |
| 2173 | flush_workqueue(cpuset_propagate_hotplug_wq); | 2271 | css_put(&cs->css); |
| 2272 | } | ||
| 2273 | rcu_read_unlock(); | ||
| 2274 | } | ||
| 2174 | 2275 | ||
| 2175 | /* rebuild sched domains if cpus_allowed has changed */ | 2276 | /* rebuild sched domains if cpus_allowed has changed */ |
| 2176 | if (cpus_updated) | 2277 | if (cpus_updated) |
| @@ -2219,12 +2320,9 @@ void __init cpuset_init_smp(void) | |||
| 2219 | { | 2320 | { |
| 2220 | cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); | 2321 | cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); |
| 2221 | top_cpuset.mems_allowed = node_states[N_MEMORY]; | 2322 | top_cpuset.mems_allowed = node_states[N_MEMORY]; |
| 2323 | top_cpuset.old_mems_allowed = top_cpuset.mems_allowed; | ||
| 2222 | 2324 | ||
| 2223 | register_hotmemory_notifier(&cpuset_track_online_nodes_nb); | 2325 | register_hotmemory_notifier(&cpuset_track_online_nodes_nb); |
| 2224 | |||
| 2225 | cpuset_propagate_hotplug_wq = | ||
| 2226 | alloc_ordered_workqueue("cpuset_hotplug", 0); | ||
| 2227 | BUG_ON(!cpuset_propagate_hotplug_wq); | ||
| 2228 | } | 2326 | } |
| 2229 | 2327 | ||
| 2230 | /** | 2328 | /** |
| @@ -2240,21 +2338,23 @@ void __init cpuset_init_smp(void) | |||
| 2240 | 2338 | ||
| 2241 | void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) | 2339 | void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) |
| 2242 | { | 2340 | { |
| 2341 | struct cpuset *cpus_cs; | ||
| 2342 | |||
| 2243 | mutex_lock(&callback_mutex); | 2343 | mutex_lock(&callback_mutex); |
| 2244 | task_lock(tsk); | 2344 | task_lock(tsk); |
| 2245 | guarantee_online_cpus(task_cs(tsk), pmask); | 2345 | cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); |
| 2346 | guarantee_online_cpus(cpus_cs, pmask); | ||
| 2246 | task_unlock(tsk); | 2347 | task_unlock(tsk); |
| 2247 | mutex_unlock(&callback_mutex); | 2348 | mutex_unlock(&callback_mutex); |
| 2248 | } | 2349 | } |
| 2249 | 2350 | ||
| 2250 | void cpuset_cpus_allowed_fallback(struct task_struct *tsk) | 2351 | void cpuset_cpus_allowed_fallback(struct task_struct *tsk) |
| 2251 | { | 2352 | { |
| 2252 | const struct cpuset *cs; | 2353 | const struct cpuset *cpus_cs; |
| 2253 | 2354 | ||
| 2254 | rcu_read_lock(); | 2355 | rcu_read_lock(); |
| 2255 | cs = task_cs(tsk); | 2356 | cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); |
| 2256 | if (cs) | 2357 | do_set_cpus_allowed(tsk, cpus_cs->cpus_allowed); |
| 2257 | do_set_cpus_allowed(tsk, cs->cpus_allowed); | ||
| 2258 | rcu_read_unlock(); | 2358 | rcu_read_unlock(); |
| 2259 | 2359 | ||
| 2260 | /* | 2360 | /* |
| @@ -2293,11 +2393,13 @@ void cpuset_init_current_mems_allowed(void) | |||
| 2293 | 2393 | ||
| 2294 | nodemask_t cpuset_mems_allowed(struct task_struct *tsk) | 2394 | nodemask_t cpuset_mems_allowed(struct task_struct *tsk) |
| 2295 | { | 2395 | { |
| 2396 | struct cpuset *mems_cs; | ||
| 2296 | nodemask_t mask; | 2397 | nodemask_t mask; |
| 2297 | 2398 | ||
| 2298 | mutex_lock(&callback_mutex); | 2399 | mutex_lock(&callback_mutex); |
| 2299 | task_lock(tsk); | 2400 | task_lock(tsk); |
| 2300 | guarantee_online_mems(task_cs(tsk), &mask); | 2401 | mems_cs = effective_nodemask_cpuset(task_cs(tsk)); |
| 2402 | guarantee_online_mems(mems_cs, &mask); | ||
| 2301 | task_unlock(tsk); | 2403 | task_unlock(tsk); |
| 2302 | mutex_unlock(&callback_mutex); | 2404 | mutex_unlock(&callback_mutex); |
| 2303 | 2405 | ||
