Diffstat (limited to 'kernel/cpuset.c')
-rw-r--r--  kernel/cpuset.c | 589
1 file changed, 287 insertions(+), 302 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f76db9dcaa05..3cf2183b472d 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -97,12 +97,6 @@ struct cpuset { | |||
97 | 97 | ||
98 | struct cpuset *parent; /* my parent */ | 98 | struct cpuset *parent; /* my parent */ |
99 | 99 | ||
100 | /* | ||
101 | * Copy of global cpuset_mems_generation as of the most | ||
102 | * recent time this cpuset changed its mems_allowed. | ||
103 | */ | ||
104 | int mems_generation; | ||
105 | |||
106 | struct fmeter fmeter; /* memory_pressure filter */ | 100 | struct fmeter fmeter; /* memory_pressure filter */ |
107 | 101 | ||
108 | /* partition number for rebuild_sched_domains() */ | 102 | /* partition number for rebuild_sched_domains() */ |
@@ -128,10 +122,6 @@ static inline struct cpuset *task_cs(struct task_struct *task) | |||
128 | return container_of(task_subsys_state(task, cpuset_subsys_id), | 122 | return container_of(task_subsys_state(task, cpuset_subsys_id), |
129 | struct cpuset, css); | 123 | struct cpuset, css); |
130 | } | 124 | } |
131 | struct cpuset_hotplug_scanner { | ||
132 | struct cgroup_scanner scan; | ||
133 | struct cgroup *to; | ||
134 | }; | ||
135 | 125 | ||
136 | /* bits in struct cpuset flags field */ | 126 | /* bits in struct cpuset flags field */ |
137 | typedef enum { | 127 | typedef enum { |
@@ -180,27 +170,6 @@ static inline int is_spread_slab(const struct cpuset *cs) | |||
180 | return test_bit(CS_SPREAD_SLAB, &cs->flags); | 170 | return test_bit(CS_SPREAD_SLAB, &cs->flags); |
181 | } | 171 | } |
182 | 172 | ||
183 | /* | ||
184 | * Increment this integer everytime any cpuset changes its | ||
185 | * mems_allowed value. Users of cpusets can track this generation | ||
186 | * number, and avoid having to lock and reload mems_allowed unless | ||
187 | * the cpuset they're using changes generation. | ||
188 | * | ||
189 | * A single, global generation is needed because cpuset_attach_task() could | ||
190 | * reattach a task to a different cpuset, which must not have its | ||
191 | * generation numbers aliased with those of that tasks previous cpuset. | ||
192 | * | ||
193 | * Generations are needed for mems_allowed because one task cannot | ||
194 | * modify another's memory placement. So we must enable every task, | ||
195 | * on every visit to __alloc_pages(), to efficiently check whether | ||
196 | * its current->cpuset->mems_allowed has changed, requiring an update | ||
197 | * of its current->mems_allowed. | ||
198 | * | ||
199 | * Since writes to cpuset_mems_generation are guarded by the cgroup lock | ||
200 | * there is no need to mark it atomic. | ||
201 | */ | ||
202 | static int cpuset_mems_generation; | ||
203 | |||
204 | static struct cpuset top_cpuset = { | 173 | static struct cpuset top_cpuset = { |
205 | .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), | 174 | .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), |
206 | }; | 175 | }; |
@@ -232,8 +201,9 @@ static struct cpuset top_cpuset = { | |||
232 | * If a task is only holding callback_mutex, then it has read-only | 201 | * If a task is only holding callback_mutex, then it has read-only |
233 | * access to cpusets. | 202 | * access to cpusets. |
234 | * | 203 | * |
235 | * The task_struct fields mems_allowed and mems_generation may only | 204 | * Now, the task_struct fields mems_allowed and mempolicy may be changed |
236 | * be accessed in the context of that task, so require no locks. | 205 | * by other task, we use alloc_lock in the task_struct fields to protect |
206 | * them. | ||
237 | * | 207 | * |
238 | * The cpuset_common_file_read() handlers only hold callback_mutex across | 208 | * The cpuset_common_file_read() handlers only hold callback_mutex across |
239 | * small pieces of code, such as when reading out possibly multi-word | 209 | * small pieces of code, such as when reading out possibly multi-word |
@@ -335,75 +305,22 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | |||
335 | BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY])); | 305 | BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY])); |
336 | } | 306 | } |
337 | 307 | ||
338 | /** | 308 | /* |
339 | * cpuset_update_task_memory_state - update task memory placement | 309 | * update task's spread flag if cpuset's page/slab spread flag is set |
340 | * | 310 | * |
341 | * If the current tasks cpusets mems_allowed changed behind our | 311 | * Called with callback_mutex/cgroup_mutex held |
342 | * backs, update current->mems_allowed, mems_generation and task NUMA | ||
343 | * mempolicy to the new value. | ||
344 | * | ||
345 | * Task mempolicy is updated by rebinding it relative to the | ||
346 | * current->cpuset if a task has its memory placement changed. | ||
347 | * Do not call this routine if in_interrupt(). | ||
348 | * | ||
349 | * Call without callback_mutex or task_lock() held. May be | ||
350 | * called with or without cgroup_mutex held. Thanks in part to | ||
351 | * 'the_top_cpuset_hack', the task's cpuset pointer will never | ||
352 | * be NULL. This routine also might acquire callback_mutex during | ||
353 | * call. | ||
354 | * | ||
355 | * Reading current->cpuset->mems_generation doesn't need task_lock | ||
356 | * to guard the current->cpuset derefence, because it is guarded | ||
357 | * from concurrent freeing of current->cpuset using RCU. | ||
358 | * | ||
359 | * The rcu_dereference() is technically probably not needed, | ||
360 | * as I don't actually mind if I see a new cpuset pointer but | ||
361 | * an old value of mems_generation. However this really only | ||
362 | * matters on alpha systems using cpusets heavily. If I dropped | ||
363 | * that rcu_dereference(), it would save them a memory barrier. | ||
364 | * For all other arch's, rcu_dereference is a no-op anyway, and for | ||
365 | * alpha systems not using cpusets, another planned optimization, | ||
366 | * avoiding the rcu critical section for tasks in the root cpuset | ||
367 | * which is statically allocated, so can't vanish, will make this | ||
368 | * irrelevant. Better to use RCU as intended, than to engage in | ||
369 | * some cute trick to save a memory barrier that is impossible to | ||
370 | * test, for alpha systems using cpusets heavily, which might not | ||
371 | * even exist. | ||
372 | * | ||
373 | * This routine is needed to update the per-task mems_allowed data, | ||
374 | * within the tasks context, when it is trying to allocate memory | ||
375 | * (in various mm/mempolicy.c routines) and notices that some other | ||
376 | * task has been modifying its cpuset. | ||
377 | */ | 312 | */ |
378 | 313 | static void cpuset_update_task_spread_flag(struct cpuset *cs, | |
379 | void cpuset_update_task_memory_state(void) | 314 | struct task_struct *tsk) |
380 | { | 315 | { |
381 | int my_cpusets_mem_gen; | 316 | if (is_spread_page(cs)) |
382 | struct task_struct *tsk = current; | 317 | tsk->flags |= PF_SPREAD_PAGE; |
383 | struct cpuset *cs; | 318 | else |
384 | 319 | tsk->flags &= ~PF_SPREAD_PAGE; | |
385 | rcu_read_lock(); | 320 | if (is_spread_slab(cs)) |
386 | my_cpusets_mem_gen = task_cs(tsk)->mems_generation; | 321 | tsk->flags |= PF_SPREAD_SLAB; |
387 | rcu_read_unlock(); | 322 | else |
388 | 323 | tsk->flags &= ~PF_SPREAD_SLAB; | |
389 | if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { | ||
390 | mutex_lock(&callback_mutex); | ||
391 | task_lock(tsk); | ||
392 | cs = task_cs(tsk); /* Maybe changed when task not locked */ | ||
393 | guarantee_online_mems(cs, &tsk->mems_allowed); | ||
394 | tsk->cpuset_mems_generation = cs->mems_generation; | ||
395 | if (is_spread_page(cs)) | ||
396 | tsk->flags |= PF_SPREAD_PAGE; | ||
397 | else | ||
398 | tsk->flags &= ~PF_SPREAD_PAGE; | ||
399 | if (is_spread_slab(cs)) | ||
400 | tsk->flags |= PF_SPREAD_SLAB; | ||
401 | else | ||
402 | tsk->flags &= ~PF_SPREAD_SLAB; | ||
403 | task_unlock(tsk); | ||
404 | mutex_unlock(&callback_mutex); | ||
405 | mpol_rebind_task(tsk, &tsk->mems_allowed); | ||
406 | } | ||
407 | } | 324 | } |
408 | 325 | ||
409 | /* | 326 | /* |
@@ -521,6 +438,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
521 | return 0; | 438 | return 0; |
522 | } | 439 | } |
523 | 440 | ||
441 | #ifdef CONFIG_SMP | ||
524 | /* | 442 | /* |
525 | * Helper routine for generate_sched_domains(). | 443 | * Helper routine for generate_sched_domains(). |
526 | * Do cpusets a, b have overlapping cpus_allowed masks? | 444 | * Do cpusets a, b have overlapping cpus_allowed masks? |
@@ -619,8 +537,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) | |||
619 | * element of the partition (one sched domain) to be passed to | 537 | * element of the partition (one sched domain) to be passed to |
620 | * partition_sched_domains(). | 538 | * partition_sched_domains(). |
621 | */ | 539 | */ |
622 | /* FIXME: see the FIXME in partition_sched_domains() */ | 540 | static int generate_sched_domains(cpumask_var_t **domains, |
623 | static int generate_sched_domains(struct cpumask **domains, | ||
624 | struct sched_domain_attr **attributes) | 541 | struct sched_domain_attr **attributes) |
625 | { | 542 | { |
626 | LIST_HEAD(q); /* queue of cpusets to be scanned */ | 543 | LIST_HEAD(q); /* queue of cpusets to be scanned */ |
@@ -628,7 +545,7 @@ static int generate_sched_domains(struct cpumask **domains, | |||
628 | struct cpuset **csa; /* array of all cpuset ptrs */ | 545 | struct cpuset **csa; /* array of all cpuset ptrs */ |
629 | int csn; /* how many cpuset ptrs in csa so far */ | 546 | int csn; /* how many cpuset ptrs in csa so far */ |
630 | int i, j, k; /* indices for partition finding loops */ | 547 | int i, j, k; /* indices for partition finding loops */ |
631 | struct cpumask *doms; /* resulting partition; i.e. sched domains */ | 548 | cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ |
632 | struct sched_domain_attr *dattr; /* attributes for custom domains */ | 549 | struct sched_domain_attr *dattr; /* attributes for custom domains */ |
633 | int ndoms = 0; /* number of sched domains in result */ | 550 | int ndoms = 0; /* number of sched domains in result */ |
634 | int nslot; /* next empty doms[] struct cpumask slot */ | 551 | int nslot; /* next empty doms[] struct cpumask slot */ |
@@ -639,7 +556,8 @@ static int generate_sched_domains(struct cpumask **domains, | |||
639 | 556 | ||
640 | /* Special case for the 99% of systems with one, full, sched domain */ | 557 | /* Special case for the 99% of systems with one, full, sched domain */ |
641 | if (is_sched_load_balance(&top_cpuset)) { | 558 | if (is_sched_load_balance(&top_cpuset)) { |
642 | doms = kmalloc(cpumask_size(), GFP_KERNEL); | 559 | ndoms = 1; |
560 | doms = alloc_sched_domains(ndoms); | ||
643 | if (!doms) | 561 | if (!doms) |
644 | goto done; | 562 | goto done; |
645 | 563 | ||
@@ -648,9 +566,8 @@ static int generate_sched_domains(struct cpumask **domains, | |||
648 | *dattr = SD_ATTR_INIT; | 566 | *dattr = SD_ATTR_INIT; |
649 | update_domain_attr_tree(dattr, &top_cpuset); | 567 | update_domain_attr_tree(dattr, &top_cpuset); |
650 | } | 568 | } |
651 | cpumask_copy(doms, top_cpuset.cpus_allowed); | 569 | cpumask_copy(doms[0], top_cpuset.cpus_allowed); |
652 | 570 | ||
653 | ndoms = 1; | ||
654 | goto done; | 571 | goto done; |
655 | } | 572 | } |
656 | 573 | ||
@@ -718,7 +635,7 @@ restart: | |||
718 | * Now we know how many domains to create. | 635 | * Now we know how many domains to create. |
719 | * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. | 636 | * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. |
720 | */ | 637 | */ |
721 | doms = kmalloc(ndoms * cpumask_size(), GFP_KERNEL); | 638 | doms = alloc_sched_domains(ndoms); |
722 | if (!doms) | 639 | if (!doms) |
723 | goto done; | 640 | goto done; |
724 | 641 | ||
@@ -738,7 +655,7 @@ restart: | |||
738 | continue; | 655 | continue; |
739 | } | 656 | } |
740 | 657 | ||
741 | dp = doms + nslot; | 658 | dp = doms[nslot]; |
742 | 659 | ||
743 | if (nslot == ndoms) { | 660 | if (nslot == ndoms) { |
744 | static int warnings = 10; | 661 | static int warnings = 10; |
@@ -800,7 +717,7 @@ done: | |||
800 | static void do_rebuild_sched_domains(struct work_struct *unused) | 717 | static void do_rebuild_sched_domains(struct work_struct *unused) |
801 | { | 718 | { |
802 | struct sched_domain_attr *attr; | 719 | struct sched_domain_attr *attr; |
803 | struct cpumask *doms; | 720 | cpumask_var_t *doms; |
804 | int ndoms; | 721 | int ndoms; |
805 | 722 | ||
806 | get_online_cpus(); | 723 | get_online_cpus(); |
@@ -815,6 +732,18 @@ static void do_rebuild_sched_domains(struct work_struct *unused) | |||
815 | 732 | ||
816 | put_online_cpus(); | 733 | put_online_cpus(); |
817 | } | 734 | } |
735 | #else /* !CONFIG_SMP */ | ||
736 | static void do_rebuild_sched_domains(struct work_struct *unused) | ||
737 | { | ||
738 | } | ||
739 | |||
740 | static int generate_sched_domains(struct cpumask **domains, | ||
741 | struct sched_domain_attr **attributes) | ||
742 | { | ||
743 | *domains = NULL; | ||
744 | return 1; | ||
745 | } | ||
746 | #endif /* CONFIG_SMP */ | ||
818 | 747 | ||
819 | static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains); | 748 | static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains); |
820 | 749 | ||
@@ -998,14 +927,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, | |||
998 | * other task, the task_struct mems_allowed that we are hacking | 927 | * other task, the task_struct mems_allowed that we are hacking |
999 | * is for our current task, which must allocate new pages for that | 928 | * is for our current task, which must allocate new pages for that |
1000 | * migrating memory region. | 929 | * migrating memory region. |
1001 | * | ||
1002 | * We call cpuset_update_task_memory_state() before hacking | ||
1003 | * our tasks mems_allowed, so that we are assured of being in | ||
1004 | * sync with our tasks cpuset, and in particular, callbacks to | ||
1005 | * cpuset_update_task_memory_state() from nested page allocations | ||
1006 | * won't see any mismatch of our cpuset and task mems_generation | ||
1007 | * values, so won't overwrite our hacked tasks mems_allowed | ||
1008 | * nodemask. | ||
1009 | */ | 930 | */ |
1010 | 931 | ||
1011 | static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, | 932 | static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, |
@@ -1013,17 +934,64 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, | |||
1013 | { | 934 | { |
1014 | struct task_struct *tsk = current; | 935 | struct task_struct *tsk = current; |
1015 | 936 | ||
1016 | cpuset_update_task_memory_state(); | ||
1017 | |||
1018 | mutex_lock(&callback_mutex); | ||
1019 | tsk->mems_allowed = *to; | 937 | tsk->mems_allowed = *to; |
1020 | mutex_unlock(&callback_mutex); | ||
1021 | 938 | ||
1022 | do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); | 939 | do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); |
1023 | 940 | ||
1024 | mutex_lock(&callback_mutex); | ||
1025 | guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed); | 941 | guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed); |
1026 | mutex_unlock(&callback_mutex); | 942 | } |
943 | |||
944 | /* | ||
945 | * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy | ||
946 | * @tsk: the task to change | ||
947 | * @newmems: new nodes that the task will be set | ||
948 | * | ||
949 | * In order to avoid seeing no nodes if the old and new nodes are disjoint, | ||
950 | * we structure updates as setting all new allowed nodes, then clearing newly | ||
951 | * disallowed ones. | ||
952 | * | ||
953 | * Called with task's alloc_lock held | ||
954 | */ | ||
955 | static void cpuset_change_task_nodemask(struct task_struct *tsk, | ||
956 | nodemask_t *newmems) | ||
957 | { | ||
958 | nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); | ||
959 | mpol_rebind_task(tsk, &tsk->mems_allowed); | ||
960 | mpol_rebind_task(tsk, newmems); | ||
961 | tsk->mems_allowed = *newmems; | ||
962 | } | ||
963 | |||
964 | /* | ||
965 | * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy | ||
966 | * of it to cpuset's new mems_allowed, and migrate pages to new nodes if | ||
967 | * memory_migrate flag is set. Called with cgroup_mutex held. | ||
968 | */ | ||
969 | static void cpuset_change_nodemask(struct task_struct *p, | ||
970 | struct cgroup_scanner *scan) | ||
971 | { | ||
972 | struct mm_struct *mm; | ||
973 | struct cpuset *cs; | ||
974 | int migrate; | ||
975 | const nodemask_t *oldmem = scan->data; | ||
976 | nodemask_t newmems; | ||
977 | |||
978 | cs = cgroup_cs(scan->cg); | ||
979 | guarantee_online_mems(cs, &newmems); | ||
980 | |||
981 | task_lock(p); | ||
982 | cpuset_change_task_nodemask(p, &newmems); | ||
983 | task_unlock(p); | ||
984 | |||
985 | mm = get_task_mm(p); | ||
986 | if (!mm) | ||
987 | return; | ||
988 | |||
989 | migrate = is_memory_migrate(cs); | ||
990 | |||
991 | mpol_rebind_mm(mm, &cs->mems_allowed); | ||
992 | if (migrate) | ||
993 | cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed); | ||
994 | mmput(mm); | ||
1027 | } | 995 | } |
1028 | 996 | ||
1029 | static void *cpuset_being_rebound; | 997 | static void *cpuset_being_rebound; |
@@ -1032,104 +1000,48 @@ static void *cpuset_being_rebound; | |||
1032 | * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. | 1000 | * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. |
1033 | * @cs: the cpuset in which each task's mems_allowed mask needs to be changed | 1001 | * @cs: the cpuset in which each task's mems_allowed mask needs to be changed |
1034 | * @oldmem: old mems_allowed of cpuset cs | 1002 | * @oldmem: old mems_allowed of cpuset cs |
1003 | * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() | ||
1035 | * | 1004 | * |
1036 | * Called with cgroup_mutex held | 1005 | * Called with cgroup_mutex held |
1037 | * Return 0 if successful, -errno if not. | 1006 | * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 |
1007 | * if @heap != NULL. | ||
1038 | */ | 1008 | */ |
1039 | static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem) | 1009 | static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, |
1010 | struct ptr_heap *heap) | ||
1040 | { | 1011 | { |
1041 | struct task_struct *p; | 1012 | struct cgroup_scanner scan; |
1042 | struct mm_struct **mmarray; | ||
1043 | int i, n, ntasks; | ||
1044 | int migrate; | ||
1045 | int fudge; | ||
1046 | struct cgroup_iter it; | ||
1047 | int retval; | ||
1048 | 1013 | ||
1049 | cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ | 1014 | cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ |
1050 | 1015 | ||
1051 | fudge = 10; /* spare mmarray[] slots */ | 1016 | scan.cg = cs->css.cgroup; |
1052 | fudge += cpumask_weight(cs->cpus_allowed);/* imagine 1 fork-bomb/cpu */ | 1017 | scan.test_task = NULL; |
1053 | retval = -ENOMEM; | 1018 | scan.process_task = cpuset_change_nodemask; |
1054 | 1019 | scan.heap = heap; | |
1055 | /* | 1020 | scan.data = (nodemask_t *)oldmem; |
1056 | * Allocate mmarray[] to hold mm reference for each task | ||
1057 | * in cpuset cs. Can't kmalloc GFP_KERNEL while holding | ||
1058 | * tasklist_lock. We could use GFP_ATOMIC, but with a | ||
1059 | * few more lines of code, we can retry until we get a big | ||
1060 | * enough mmarray[] w/o using GFP_ATOMIC. | ||
1061 | */ | ||
1062 | while (1) { | ||
1063 | ntasks = cgroup_task_count(cs->css.cgroup); /* guess */ | ||
1064 | ntasks += fudge; | ||
1065 | mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL); | ||
1066 | if (!mmarray) | ||
1067 | goto done; | ||
1068 | read_lock(&tasklist_lock); /* block fork */ | ||
1069 | if (cgroup_task_count(cs->css.cgroup) <= ntasks) | ||
1070 | break; /* got enough */ | ||
1071 | read_unlock(&tasklist_lock); /* try again */ | ||
1072 | kfree(mmarray); | ||
1073 | } | ||
1074 | |||
1075 | n = 0; | ||
1076 | |||
1077 | /* Load up mmarray[] with mm reference for each task in cpuset. */ | ||
1078 | cgroup_iter_start(cs->css.cgroup, &it); | ||
1079 | while ((p = cgroup_iter_next(cs->css.cgroup, &it))) { | ||
1080 | struct mm_struct *mm; | ||
1081 | |||
1082 | if (n >= ntasks) { | ||
1083 | printk(KERN_WARNING | ||
1084 | "Cpuset mempolicy rebind incomplete.\n"); | ||
1085 | break; | ||
1086 | } | ||
1087 | mm = get_task_mm(p); | ||
1088 | if (!mm) | ||
1089 | continue; | ||
1090 | mmarray[n++] = mm; | ||
1091 | } | ||
1092 | cgroup_iter_end(cs->css.cgroup, &it); | ||
1093 | read_unlock(&tasklist_lock); | ||
1094 | 1021 | ||
1095 | /* | 1022 | /* |
1096 | * Now that we've dropped the tasklist spinlock, we can | 1023 | * The mpol_rebind_mm() call takes mmap_sem, which we couldn't |
1097 | * rebind the vma mempolicies of each mm in mmarray[] to their | 1024 | * take while holding tasklist_lock. Forks can happen - the |
1098 | * new cpuset, and release that mm. The mpol_rebind_mm() | 1025 | * mpol_dup() cpuset_being_rebound check will catch such forks, |
1099 | * call takes mmap_sem, which we couldn't take while holding | 1026 | * and rebind their vma mempolicies too. Because we still hold |
1100 | * tasklist_lock. Forks can happen again now - the mpol_dup() | 1027 | * the global cgroup_mutex, we know that no other rebind effort |
1101 | * cpuset_being_rebound check will catch such forks, and rebind | 1028 | * will be contending for the global variable cpuset_being_rebound. |
1102 | * their vma mempolicies too. Because we still hold the global | ||
1103 | * cgroup_mutex, we know that no other rebind effort will | ||
1104 | * be contending for the global variable cpuset_being_rebound. | ||
1105 | * It's ok if we rebind the same mm twice; mpol_rebind_mm() | 1029 | * It's ok if we rebind the same mm twice; mpol_rebind_mm() |
1106 | * is idempotent. Also migrate pages in each mm to new nodes. | 1030 | * is idempotent. Also migrate pages in each mm to new nodes. |
1107 | */ | 1031 | */ |
1108 | migrate = is_memory_migrate(cs); | 1032 | cgroup_scan_tasks(&scan); |
1109 | for (i = 0; i < n; i++) { | ||
1110 | struct mm_struct *mm = mmarray[i]; | ||
1111 | |||
1112 | mpol_rebind_mm(mm, &cs->mems_allowed); | ||
1113 | if (migrate) | ||
1114 | cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed); | ||
1115 | mmput(mm); | ||
1116 | } | ||
1117 | 1033 | ||
1118 | /* We're done rebinding vmas to this cpuset's new mems_allowed. */ | 1034 | /* We're done rebinding vmas to this cpuset's new mems_allowed. */ |
1119 | kfree(mmarray); | ||
1120 | cpuset_being_rebound = NULL; | 1035 | cpuset_being_rebound = NULL; |
1121 | retval = 0; | ||
1122 | done: | ||
1123 | return retval; | ||
1124 | } | 1036 | } |
1125 | 1037 | ||
1126 | /* | 1038 | /* |
1127 | * Handle user request to change the 'mems' memory placement | 1039 | * Handle user request to change the 'mems' memory placement |
1128 | * of a cpuset. Needs to validate the request, update the | 1040 | * of a cpuset. Needs to validate the request, update the |
1129 | * cpusets mems_allowed and mems_generation, and for each | 1041 | * cpusets mems_allowed, and for each task in the cpuset, |
1130 | * task in the cpuset, rebind any vma mempolicies and if | 1042 | * update mems_allowed and rebind task's mempolicy and any vma |
1131 | * the cpuset is marked 'memory_migrate', migrate the tasks | 1043 | * mempolicies and if the cpuset is marked 'memory_migrate', |
1132 | * pages to the new memory. | 1044 | * migrate the tasks pages to the new memory. |
1133 | * | 1045 | * |
1134 | * Call with cgroup_mutex held. May take callback_mutex during call. | 1046 | * Call with cgroup_mutex held. May take callback_mutex during call. |
1135 | * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, | 1047 | * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, |
@@ -1141,6 +1053,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, | |||
1141 | { | 1053 | { |
1142 | nodemask_t oldmem; | 1054 | nodemask_t oldmem; |
1143 | int retval; | 1055 | int retval; |
1056 | struct ptr_heap heap; | ||
1144 | 1057 | ||
1145 | /* | 1058 | /* |
1146 | * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; | 1059 | * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; |
@@ -1175,12 +1088,17 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, | |||
1175 | if (retval < 0) | 1088 | if (retval < 0) |
1176 | goto done; | 1089 | goto done; |
1177 | 1090 | ||
1091 | retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); | ||
1092 | if (retval < 0) | ||
1093 | goto done; | ||
1094 | |||
1178 | mutex_lock(&callback_mutex); | 1095 | mutex_lock(&callback_mutex); |
1179 | cs->mems_allowed = trialcs->mems_allowed; | 1096 | cs->mems_allowed = trialcs->mems_allowed; |
1180 | cs->mems_generation = cpuset_mems_generation++; | ||
1181 | mutex_unlock(&callback_mutex); | 1097 | mutex_unlock(&callback_mutex); |
1182 | 1098 | ||
1183 | retval = update_tasks_nodemask(cs, &oldmem); | 1099 | update_tasks_nodemask(cs, &oldmem, &heap); |
1100 | |||
1101 | heap_free(&heap); | ||
1184 | done: | 1102 | done: |
1185 | return retval; | 1103 | return retval; |
1186 | } | 1104 | } |
@@ -1192,8 +1110,10 @@ int current_cpuset_is_being_rebound(void) | |||
1192 | 1110 | ||
1193 | static int update_relax_domain_level(struct cpuset *cs, s64 val) | 1111 | static int update_relax_domain_level(struct cpuset *cs, s64 val) |
1194 | { | 1112 | { |
1113 | #ifdef CONFIG_SMP | ||
1195 | if (val < -1 || val >= SD_LV_MAX) | 1114 | if (val < -1 || val >= SD_LV_MAX) |
1196 | return -EINVAL; | 1115 | return -EINVAL; |
1116 | #endif | ||
1197 | 1117 | ||
1198 | if (val != cs->relax_domain_level) { | 1118 | if (val != cs->relax_domain_level) { |
1199 | cs->relax_domain_level = val; | 1119 | cs->relax_domain_level = val; |
@@ -1206,6 +1126,46 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) | |||
1206 | } | 1126 | } |
1207 | 1127 | ||
1208 | /* | 1128 | /* |
1129 | * cpuset_change_flag - make a task's spread flags the same as its cpuset's | ||
1130 | * @tsk: task to be updated | ||
1131 | * @scan: struct cgroup_scanner containing the cgroup of the task | ||
1132 | * | ||
1133 | * Called by cgroup_scan_tasks() for each task in a cgroup. | ||
1134 | * | ||
1135 | * We don't need to re-check for the cgroup/cpuset membership, since we're | ||
1136 | * holding cgroup_lock() at this point. | ||
1137 | */ | ||
1138 | static void cpuset_change_flag(struct task_struct *tsk, | ||
1139 | struct cgroup_scanner *scan) | ||
1140 | { | ||
1141 | cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk); | ||
1142 | } | ||
1143 | |||
1144 | /* | ||
1145 | * update_tasks_flags - update the spread flags of tasks in the cpuset. | ||
1146 | * @cs: the cpuset in which each task's spread flags needs to be changed | ||
1147 | * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() | ||
1148 | * | ||
1149 | * Called with cgroup_mutex held | ||
1150 | * | ||
1151 | * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, | ||
1152 | * calling callback functions for each. | ||
1153 | * | ||
1154 | * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 | ||
1155 | * if @heap != NULL. | ||
1156 | */ | ||
1157 | static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) | ||
1158 | { | ||
1159 | struct cgroup_scanner scan; | ||
1160 | |||
1161 | scan.cg = cs->css.cgroup; | ||
1162 | scan.test_task = NULL; | ||
1163 | scan.process_task = cpuset_change_flag; | ||
1164 | scan.heap = heap; | ||
1165 | cgroup_scan_tasks(&scan); | ||
1166 | } | ||
1167 | |||
1168 | /* | ||
1209 | * update_flag - read a 0 or a 1 in a file and update associated flag | 1169 | * update_flag - read a 0 or a 1 in a file and update associated flag |
1210 | * bit: the bit to update (see cpuset_flagbits_t) | 1170 | * bit: the bit to update (see cpuset_flagbits_t) |
1211 | * cs: the cpuset to update | 1171 | * cs: the cpuset to update |
@@ -1218,8 +1178,10 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, | |||
1218 | int turning_on) | 1178 | int turning_on) |
1219 | { | 1179 | { |
1220 | struct cpuset *trialcs; | 1180 | struct cpuset *trialcs; |
1221 | int err; | ||
1222 | int balance_flag_changed; | 1181 | int balance_flag_changed; |
1182 | int spread_flag_changed; | ||
1183 | struct ptr_heap heap; | ||
1184 | int err; | ||
1223 | 1185 | ||
1224 | trialcs = alloc_trial_cpuset(cs); | 1186 | trialcs = alloc_trial_cpuset(cs); |
1225 | if (!trialcs) | 1187 | if (!trialcs) |
@@ -1234,9 +1196,16 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, | |||
1234 | if (err < 0) | 1196 | if (err < 0) |
1235 | goto out; | 1197 | goto out; |
1236 | 1198 | ||
1199 | err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); | ||
1200 | if (err < 0) | ||
1201 | goto out; | ||
1202 | |||
1237 | balance_flag_changed = (is_sched_load_balance(cs) != | 1203 | balance_flag_changed = (is_sched_load_balance(cs) != |
1238 | is_sched_load_balance(trialcs)); | 1204 | is_sched_load_balance(trialcs)); |
1239 | 1205 | ||
1206 | spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs)) | ||
1207 | || (is_spread_page(cs) != is_spread_page(trialcs))); | ||
1208 | |||
1240 | mutex_lock(&callback_mutex); | 1209 | mutex_lock(&callback_mutex); |
1241 | cs->flags = trialcs->flags; | 1210 | cs->flags = trialcs->flags; |
1242 | mutex_unlock(&callback_mutex); | 1211 | mutex_unlock(&callback_mutex); |
@@ -1244,6 +1213,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, | |||
1244 | if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) | 1213 | if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) |
1245 | async_rebuild_sched_domains(); | 1214 | async_rebuild_sched_domains(); |
1246 | 1215 | ||
1216 | if (spread_flag_changed) | ||
1217 | update_tasks_flags(cs, &heap); | ||
1218 | heap_free(&heap); | ||
1247 | out: | 1219 | out: |
1248 | free_trial_cpuset(trialcs); | 1220 | free_trial_cpuset(trialcs); |
1249 | return err; | 1221 | return err; |
@@ -1351,46 +1323,92 @@ static int fmeter_getrate(struct fmeter *fmp) | |||
1351 | static cpumask_var_t cpus_attach; | 1323 | static cpumask_var_t cpus_attach; |
1352 | 1324 | ||
1353 | /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ | 1325 | /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ |
1354 | static int cpuset_can_attach(struct cgroup_subsys *ss, | 1326 | static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, |
1355 | struct cgroup *cont, struct task_struct *tsk) | 1327 | struct task_struct *tsk, bool threadgroup) |
1356 | { | 1328 | { |
1329 | int ret; | ||
1357 | struct cpuset *cs = cgroup_cs(cont); | 1330 | struct cpuset *cs = cgroup_cs(cont); |
1358 | int ret = 0; | ||
1359 | 1331 | ||
1360 | if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) | 1332 | if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) |
1361 | return -ENOSPC; | 1333 | return -ENOSPC; |
1362 | 1334 | ||
1363 | if (tsk->flags & PF_THREAD_BOUND) { | 1335 | /* |
1364 | mutex_lock(&callback_mutex); | 1336 | * Kthreads bound to specific cpus cannot be moved to a new cpuset; we |
1365 | if (!cpumask_equal(&tsk->cpus_allowed, cs->cpus_allowed)) | 1337 | * cannot change their cpu affinity and isolating such threads by their |
1366 | ret = -EINVAL; | 1338 | * set of allowed nodes is unnecessary. Thus, cpusets are not |
1367 | mutex_unlock(&callback_mutex); | 1339 | * applicable for such threads. This prevents checking for success of |
1340 | * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may | ||
1341 | * be changed. | ||
1342 | */ | ||
1343 | if (tsk->flags & PF_THREAD_BOUND) | ||
1344 | return -EINVAL; | ||
1345 | |||
1346 | ret = security_task_setscheduler(tsk, 0, NULL); | ||
1347 | if (ret) | ||
1348 | return ret; | ||
1349 | if (threadgroup) { | ||
1350 | struct task_struct *c; | ||
1351 | |||
1352 | rcu_read_lock(); | ||
1353 | list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { | ||
1354 | ret = security_task_setscheduler(c, 0, NULL); | ||
1355 | if (ret) { | ||
1356 | rcu_read_unlock(); | ||
1357 | return ret; | ||
1358 | } | ||
1359 | } | ||
1360 | rcu_read_unlock(); | ||
1368 | } | 1361 | } |
1362 | return 0; | ||
1363 | } | ||
1364 | |||
1365 | static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to, | ||
1366 | struct cpuset *cs) | ||
1367 | { | ||
1368 | int err; | ||
1369 | /* | ||
1370 | * can_attach beforehand should guarantee that this doesn't fail. | ||
1371 | * TODO: have a better way to handle failure here | ||
1372 | */ | ||
1373 | err = set_cpus_allowed_ptr(tsk, cpus_attach); | ||
1374 | WARN_ON_ONCE(err); | ||
1375 | |||
1376 | task_lock(tsk); | ||
1377 | cpuset_change_task_nodemask(tsk, to); | ||
1378 | task_unlock(tsk); | ||
1379 | cpuset_update_task_spread_flag(cs, tsk); | ||
1369 | 1380 | ||
1370 | return ret < 0 ? ret : security_task_setscheduler(tsk, 0, NULL); | ||
1371 | } | 1381 | } |
1372 | 1382 | ||
1373 | static void cpuset_attach(struct cgroup_subsys *ss, | 1383 | static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, |
1374 | struct cgroup *cont, struct cgroup *oldcont, | 1384 | struct cgroup *oldcont, struct task_struct *tsk, |
1375 | struct task_struct *tsk) | 1385 | bool threadgroup) |
1376 | { | 1386 | { |
1377 | nodemask_t from, to; | 1387 | nodemask_t from, to; |
1378 | struct mm_struct *mm; | 1388 | struct mm_struct *mm; |
1379 | struct cpuset *cs = cgroup_cs(cont); | 1389 | struct cpuset *cs = cgroup_cs(cont); |
1380 | struct cpuset *oldcs = cgroup_cs(oldcont); | 1390 | struct cpuset *oldcs = cgroup_cs(oldcont); |
1381 | int err; | ||
1382 | 1391 | ||
1383 | if (cs == &top_cpuset) { | 1392 | if (cs == &top_cpuset) { |
1384 | cpumask_copy(cpus_attach, cpu_possible_mask); | 1393 | cpumask_copy(cpus_attach, cpu_possible_mask); |
1394 | to = node_possible_map; | ||
1385 | } else { | 1395 | } else { |
1386 | mutex_lock(&callback_mutex); | ||
1387 | guarantee_online_cpus(cs, cpus_attach); | 1396 | guarantee_online_cpus(cs, cpus_attach); |
1388 | mutex_unlock(&callback_mutex); | 1397 | guarantee_online_mems(cs, &to); |
1398 | } | ||
1399 | |||
1400 | /* do per-task migration stuff possibly for each in the threadgroup */ | ||
1401 | cpuset_attach_task(tsk, &to, cs); | ||
1402 | if (threadgroup) { | ||
1403 | struct task_struct *c; | ||
1404 | rcu_read_lock(); | ||
1405 | list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { | ||
1406 | cpuset_attach_task(c, &to, cs); | ||
1407 | } | ||
1408 | rcu_read_unlock(); | ||
1389 | } | 1409 | } |
1390 | err = set_cpus_allowed_ptr(tsk, cpus_attach); | ||
1391 | if (err) | ||
1392 | return; | ||
1393 | 1410 | ||
1411 | /* change mm; only needs to be done once even if threadgroup */ | ||
1394 | from = oldcs->mems_allowed; | 1412 | from = oldcs->mems_allowed; |
1395 | to = cs->mems_allowed; | 1413 | to = cs->mems_allowed; |
1396 | mm = get_task_mm(tsk); | 1414 | mm = get_task_mm(tsk); |
@@ -1452,11 +1470,9 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) | |||
1452 | break; | 1470 | break; |
1453 | case FILE_SPREAD_PAGE: | 1471 | case FILE_SPREAD_PAGE: |
1454 | retval = update_flag(CS_SPREAD_PAGE, cs, val); | 1472 | retval = update_flag(CS_SPREAD_PAGE, cs, val); |
1455 | cs->mems_generation = cpuset_mems_generation++; | ||
1456 | break; | 1473 | break; |
1457 | case FILE_SPREAD_SLAB: | 1474 | case FILE_SPREAD_SLAB: |
1458 | retval = update_flag(CS_SPREAD_SLAB, cs, val); | 1475 | retval = update_flag(CS_SPREAD_SLAB, cs, val); |
1459 | cs->mems_generation = cpuset_mems_generation++; | ||
1460 | break; | 1476 | break; |
1461 | default: | 1477 | default: |
1462 | retval = -EINVAL; | 1478 | retval = -EINVAL; |
@@ -1706,6 +1722,7 @@ static struct cftype files[] = { | |||
1706 | .read_u64 = cpuset_read_u64, | 1722 | .read_u64 = cpuset_read_u64, |
1707 | .write_u64 = cpuset_write_u64, | 1723 | .write_u64 = cpuset_write_u64, |
1708 | .private = FILE_MEMORY_PRESSURE, | 1724 | .private = FILE_MEMORY_PRESSURE, |
1725 | .mode = S_IRUGO, | ||
1709 | }, | 1726 | }, |
1710 | 1727 | ||
1711 | { | 1728 | { |
@@ -1795,8 +1812,6 @@ static struct cgroup_subsys_state *cpuset_create( | |||
1795 | struct cpuset *parent; | 1812 | struct cpuset *parent; |
1796 | 1813 | ||
1797 | if (!cont->parent) { | 1814 | if (!cont->parent) { |
1798 | /* This is early initialization for the top cgroup */ | ||
1799 | top_cpuset.mems_generation = cpuset_mems_generation++; | ||
1800 | return &top_cpuset.css; | 1815 | return &top_cpuset.css; |
1801 | } | 1816 | } |
1802 | parent = cgroup_cs(cont->parent); | 1817 | parent = cgroup_cs(cont->parent); |
@@ -1808,7 +1823,6 @@ static struct cgroup_subsys_state *cpuset_create( | |||
1808 | return ERR_PTR(-ENOMEM); | 1823 | return ERR_PTR(-ENOMEM); |
1809 | } | 1824 | } |
1810 | 1825 | ||
1811 | cpuset_update_task_memory_state(); | ||
1812 | cs->flags = 0; | 1826 | cs->flags = 0; |
1813 | if (is_spread_page(parent)) | 1827 | if (is_spread_page(parent)) |
1814 | set_bit(CS_SPREAD_PAGE, &cs->flags); | 1828 | set_bit(CS_SPREAD_PAGE, &cs->flags); |
@@ -1817,7 +1831,6 @@ static struct cgroup_subsys_state *cpuset_create( | |||
1817 | set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); | 1831 | set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); |
1818 | cpumask_clear(cs->cpus_allowed); | 1832 | cpumask_clear(cs->cpus_allowed); |
1819 | nodes_clear(cs->mems_allowed); | 1833 | nodes_clear(cs->mems_allowed); |
1820 | cs->mems_generation = cpuset_mems_generation++; | ||
1821 | fmeter_init(&cs->fmeter); | 1834 | fmeter_init(&cs->fmeter); |
1822 | cs->relax_domain_level = -1; | 1835 | cs->relax_domain_level = -1; |
1823 | 1836 | ||
@@ -1836,8 +1849,6 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) | |||
1836 | { | 1849 | { |
1837 | struct cpuset *cs = cgroup_cs(cont); | 1850 | struct cpuset *cs = cgroup_cs(cont); |
1838 | 1851 | ||
1839 | cpuset_update_task_memory_state(); | ||
1840 | |||
1841 | if (is_sched_load_balance(cs)) | 1852 | if (is_sched_load_balance(cs)) |
1842 | update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); | 1853 | update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); |
1843 | 1854 | ||
@@ -1858,21 +1869,6 @@ struct cgroup_subsys cpuset_subsys = { | |||
1858 | .early_init = 1, | 1869 | .early_init = 1, |
1859 | }; | 1870 | }; |
1860 | 1871 | ||
1861 | /* | ||
1862 | * cpuset_init_early - just enough so that the calls to | ||
1863 | * cpuset_update_task_memory_state() in early init code | ||
1864 | * are harmless. | ||
1865 | */ | ||
1866 | |||
1867 | int __init cpuset_init_early(void) | ||
1868 | { | ||
1869 | alloc_bootmem_cpumask_var(&top_cpuset.cpus_allowed); | ||
1870 | |||
1871 | top_cpuset.mems_generation = cpuset_mems_generation++; | ||
1872 | return 0; | ||
1873 | } | ||
1874 | |||
1875 | |||
1876 | /** | 1872 | /** |
1877 | * cpuset_init - initialize cpusets at system boot | 1873 | * cpuset_init - initialize cpusets at system boot |
1878 | * | 1874 | * |
@@ -1883,11 +1879,13 @@ int __init cpuset_init(void) | |||
1883 | { | 1879 | { |
1884 | int err = 0; | 1880 | int err = 0; |
1885 | 1881 | ||
1882 | if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)) | ||
1883 | BUG(); | ||
1884 | |||
1886 | cpumask_setall(top_cpuset.cpus_allowed); | 1885 | cpumask_setall(top_cpuset.cpus_allowed); |
1887 | nodes_setall(top_cpuset.mems_allowed); | 1886 | nodes_setall(top_cpuset.mems_allowed); |
1888 | 1887 | ||
1889 | fmeter_init(&top_cpuset.fmeter); | 1888 | fmeter_init(&top_cpuset.fmeter); |
1890 | top_cpuset.mems_generation = cpuset_mems_generation++; | ||
1891 | set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); | 1889 | set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); |
1892 | top_cpuset.relax_domain_level = -1; | 1890 | top_cpuset.relax_domain_level = -1; |
1893 | 1891 | ||
@@ -1913,10 +1911,9 @@ int __init cpuset_init(void) | |||
1913 | static void cpuset_do_move_task(struct task_struct *tsk, | 1911 | static void cpuset_do_move_task(struct task_struct *tsk, |
1914 | struct cgroup_scanner *scan) | 1912 | struct cgroup_scanner *scan) |
1915 | { | 1913 | { |
1916 | struct cpuset_hotplug_scanner *chsp; | 1914 | struct cgroup *new_cgroup = scan->data; |
1917 | 1915 | ||
1918 | chsp = container_of(scan, struct cpuset_hotplug_scanner, scan); | 1916 | cgroup_attach_task(new_cgroup, tsk); |
1919 | cgroup_attach_task(chsp->to, tsk); | ||
1920 | } | 1917 | } |
1921 | 1918 | ||
1922 | /** | 1919 | /** |
@@ -1932,15 +1929,15 @@ static void cpuset_do_move_task(struct task_struct *tsk, | |||
1932 | */ | 1929 | */ |
1933 | static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) | 1930 | static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) |
1934 | { | 1931 | { |
1935 | struct cpuset_hotplug_scanner scan; | 1932 | struct cgroup_scanner scan; |
1936 | 1933 | ||
1937 | scan.scan.cg = from->css.cgroup; | 1934 | scan.cg = from->css.cgroup; |
1938 | scan.scan.test_task = NULL; /* select all tasks in cgroup */ | 1935 | scan.test_task = NULL; /* select all tasks in cgroup */ |
1939 | scan.scan.process_task = cpuset_do_move_task; | 1936 | scan.process_task = cpuset_do_move_task; |
1940 | scan.scan.heap = NULL; | 1937 | scan.heap = NULL; |
1941 | scan.to = to->css.cgroup; | 1938 | scan.data = to->css.cgroup; |
1942 | 1939 | ||
1943 | if (cgroup_scan_tasks(&scan.scan)) | 1940 | if (cgroup_scan_tasks(&scan)) |
1944 | printk(KERN_ERR "move_member_tasks_to_cpuset: " | 1941 | printk(KERN_ERR "move_member_tasks_to_cpuset: " |
1945 | "cgroup_scan_tasks failed\n"); | 1942 | "cgroup_scan_tasks failed\n"); |
1946 | } | 1943 | } |
@@ -2033,7 +2030,7 @@ static void scan_for_empty_cpusets(struct cpuset *root) | |||
2033 | remove_tasks_in_empty_cpuset(cp); | 2030 | remove_tasks_in_empty_cpuset(cp); |
2034 | else { | 2031 | else { |
2035 | update_tasks_cpumask(cp, NULL); | 2032 | update_tasks_cpumask(cp, NULL); |
2036 | update_tasks_nodemask(cp, &oldmems); | 2033 | update_tasks_nodemask(cp, &oldmems, NULL); |
2037 | } | 2034 | } |
2038 | } | 2035 | } |
2039 | } | 2036 | } |
@@ -2054,7 +2051,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, | |||
2054 | unsigned long phase, void *unused_cpu) | 2051 | unsigned long phase, void *unused_cpu) |
2055 | { | 2052 | { |
2056 | struct sched_domain_attr *attr; | 2053 | struct sched_domain_attr *attr; |
2057 | struct cpumask *doms; | 2054 | cpumask_var_t *doms; |
2058 | int ndoms; | 2055 | int ndoms; |
2059 | 2056 | ||
2060 | switch (phase) { | 2057 | switch (phase) { |
@@ -2069,7 +2066,9 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, | |||
2069 | } | 2066 | } |
2070 | 2067 | ||
2071 | cgroup_lock(); | 2068 | cgroup_lock(); |
2069 | mutex_lock(&callback_mutex); | ||
2072 | cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); | 2070 | cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); |
2071 | mutex_unlock(&callback_mutex); | ||
2073 | scan_for_empty_cpusets(&top_cpuset); | 2072 | scan_for_empty_cpusets(&top_cpuset); |
2074 | ndoms = generate_sched_domains(&doms, &attr); | 2073 | ndoms = generate_sched_domains(&doms, &attr); |
2075 | cgroup_unlock(); | 2074 | cgroup_unlock(); |
@@ -2092,11 +2091,12 @@ static int cpuset_track_online_nodes(struct notifier_block *self, | |||
2092 | cgroup_lock(); | 2091 | cgroup_lock(); |
2093 | switch (action) { | 2092 | switch (action) { |
2094 | case MEM_ONLINE: | 2093 | case MEM_ONLINE: |
2095 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | ||
2096 | break; | ||
2097 | case MEM_OFFLINE: | 2094 | case MEM_OFFLINE: |
2095 | mutex_lock(&callback_mutex); | ||
2098 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | 2096 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; |
2099 | scan_for_empty_cpusets(&top_cpuset); | 2097 | mutex_unlock(&callback_mutex); |
2098 | if (action == MEM_OFFLINE) | ||
2099 | scan_for_empty_cpusets(&top_cpuset); | ||
2100 | break; | 2100 | break; |
2101 | default: | 2101 | default: |
2102 | break; | 2102 | break; |
@@ -2206,26 +2206,24 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) | |||
2206 | } | 2206 | } |
2207 | 2207 | ||
2208 | /** | 2208 | /** |
2209 | * cpuset_zone_allowed_softwall - Can we allocate on zone z's memory node? | 2209 | * cpuset_node_allowed_softwall - Can we allocate on a memory node? |
2210 | * @z: is this zone on an allowed node? | 2210 | * @node: is this an allowed node? |
2211 | * @gfp_mask: memory allocation flags | 2211 | * @gfp_mask: memory allocation flags |
2212 | * | 2212 | * |
2213 | * If we're in interrupt, yes, we can always allocate. If | 2213 | * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is |
2214 | * __GFP_THISNODE is set, yes, we can always allocate. If zone | 2214 | * set, yes, we can always allocate. If node is in our task's mems_allowed, |
2215 | * z's node is in our tasks mems_allowed, yes. If it's not a | 2215 | * yes. If it's not a __GFP_HARDWALL request and this node is in the nearest |
2216 | * __GFP_HARDWALL request and this zone's nodes is in the nearest | 2216 | * hardwalled cpuset ancestor to this task's cpuset, yes. If the task has been |
2217 | * hardwalled cpuset ancestor to this tasks cpuset, yes. | 2217 | * OOM killed and has access to memory reserves as specified by the TIF_MEMDIE |
2218 | * If the task has been OOM killed and has access to memory reserves | 2218 | * flag, yes. |
2219 | * as specified by the TIF_MEMDIE flag, yes. | ||
2220 | * Otherwise, no. | 2219 | * Otherwise, no. |
2221 | * | 2220 | * |
2222 | * If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall() | 2221 | * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to |
2223 | * reduces to cpuset_zone_allowed_hardwall(). Otherwise, | 2222 | * cpuset_node_allowed_hardwall(). Otherwise, cpuset_node_allowed_softwall() |
2224 | * cpuset_zone_allowed_softwall() might sleep, and might allow a zone | 2223 | * might sleep, and might allow a node from an enclosing cpuset. |
2225 | * from an enclosing cpuset. | ||
2226 | * | 2224 | * |
2227 | * cpuset_zone_allowed_hardwall() only handles the simpler case of | 2225 | * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall |
2228 | * hardwall cpusets, and never sleeps. | 2226 | * cpusets, and never sleeps. |
2229 | * | 2227 | * |
2230 | * The __GFP_THISNODE placement logic is really handled elsewhere, | 2228 | * The __GFP_THISNODE placement logic is really handled elsewhere, |
2231 | * by forcibly using a zonelist starting at a specified node, and by | 2229 | * by forcibly using a zonelist starting at a specified node, and by |
@@ -2264,20 +2262,17 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) | |||
2264 | * GFP_USER - only nodes in current tasks mems allowed ok. | 2262 | * GFP_USER - only nodes in current tasks mems allowed ok. |
2265 | * | 2263 | * |
2266 | * Rule: | 2264 | * Rule: |
2267 | * Don't call cpuset_zone_allowed_softwall if you can't sleep, unless you | 2265 | * Don't call cpuset_node_allowed_softwall if you can't sleep, unless you |
2268 | * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables | 2266 | * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables |
2269 | * the code that might scan up ancestor cpusets and sleep. | 2267 | * the code that might scan up ancestor cpusets and sleep. |
2270 | */ | 2268 | */ |
2271 | 2269 | int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) | |
2272 | int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) | ||
2273 | { | 2270 | { |
2274 | int node; /* node that zone z is on */ | ||
2275 | const struct cpuset *cs; /* current cpuset ancestors */ | 2271 | const struct cpuset *cs; /* current cpuset ancestors */ |
2276 | int allowed; /* is allocation in zone z allowed? */ | 2272 | int allowed; /* is allocation in zone z allowed? */ |
2277 | 2273 | ||
2278 | if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) | 2274 | if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) |
2279 | return 1; | 2275 | return 1; |
2280 | node = zone_to_nid(z); | ||
2281 | might_sleep_if(!(gfp_mask & __GFP_HARDWALL)); | 2276 | might_sleep_if(!(gfp_mask & __GFP_HARDWALL)); |
2282 | if (node_isset(node, current->mems_allowed)) | 2277 | if (node_isset(node, current->mems_allowed)) |
2283 | return 1; | 2278 | return 1; |
@@ -2306,15 +2301,15 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) | |||
2306 | } | 2301 | } |
2307 | 2302 | ||
2308 | /* | 2303 | /* |
2309 | * cpuset_zone_allowed_hardwall - Can we allocate on zone z's memory node? | 2304 | * cpuset_node_allowed_hardwall - Can we allocate on a memory node? |
2310 | * @z: is this zone on an allowed node? | 2305 | * @node: is this an allowed node? |
2311 | * @gfp_mask: memory allocation flags | 2306 | * @gfp_mask: memory allocation flags |
2312 | * | 2307 | * |
2313 | * If we're in interrupt, yes, we can always allocate. | 2308 | * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is |
2314 | * If __GFP_THISNODE is set, yes, we can always allocate. If zone | 2309 | * set, yes, we can always allocate. If node is in our task's mems_allowed, |
2315 | * z's node is in our tasks mems_allowed, yes. If the task has been | 2310 | * yes. If the task has been OOM killed and has access to memory reserves as |
2316 | * OOM killed and has access to memory reserves as specified by the | 2311 | * specified by the TIF_MEMDIE flag, yes. |
2317 | * TIF_MEMDIE flag, yes. Otherwise, no. | 2312 | * Otherwise, no. |
2318 | * | 2313 | * |
2319 | * The __GFP_THISNODE placement logic is really handled elsewhere, | 2314 | * The __GFP_THISNODE placement logic is really handled elsewhere, |
2320 | * by forcibly using a zonelist starting at a specified node, and by | 2315 | * by forcibly using a zonelist starting at a specified node, and by |
@@ -2322,20 +2317,16 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) | |||
2322 | * any node on the zonelist except the first. By the time any such | 2317 | * any node on the zonelist except the first. By the time any such |
2323 | * calls get to this routine, we should just shut up and say 'yes'. | 2318 | * calls get to this routine, we should just shut up and say 'yes'. |
2324 | * | 2319 | * |
2325 | * Unlike the cpuset_zone_allowed_softwall() variant, above, | 2320 | * Unlike the cpuset_node_allowed_softwall() variant, above, |
2326 | * this variant requires that the zone be in the current tasks | 2321 | * this variant requires that the node be in the current task's |
2327 | * mems_allowed or that we're in interrupt. It does not scan up the | 2322 | * mems_allowed or that we're in interrupt. It does not scan up the |
2328 | * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset. | 2323 | * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset. |
2329 | * It never sleeps. | 2324 | * It never sleeps. |
2330 | */ | 2325 | */ |
2331 | 2326 | int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) | |
2332 | int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask) | ||
2333 | { | 2327 | { |
2334 | int node; /* node that zone z is on */ | ||
2335 | |||
2336 | if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) | 2328 | if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) |
2337 | return 1; | 2329 | return 1; |
2338 | node = zone_to_nid(z); | ||
2339 | if (node_isset(node, current->mems_allowed)) | 2330 | if (node_isset(node, current->mems_allowed)) |
2340 | return 1; | 2331 | return 1; |
2341 | /* | 2332 | /* |
@@ -2545,15 +2536,9 @@ const struct file_operations proc_cpuset_operations = { | |||
2545 | }; | 2536 | }; |
2546 | #endif /* CONFIG_PROC_PID_CPUSET */ | 2537 | #endif /* CONFIG_PROC_PID_CPUSET */ |
2547 | 2538 | ||
2548 | /* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */ | 2539 | /* Display task mems_allowed in /proc/<pid>/status file. */ |
2549 | void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) | 2540 | void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) |
2550 | { | 2541 | { |
2551 | seq_printf(m, "Cpus_allowed:\t"); | ||
2552 | seq_cpumask(m, &task->cpus_allowed); | ||
2553 | seq_printf(m, "\n"); | ||
2554 | seq_printf(m, "Cpus_allowed_list:\t"); | ||
2555 | seq_cpumask_list(m, &task->cpus_allowed); | ||
2556 | seq_printf(m, "\n"); | ||
2557 | seq_printf(m, "Mems_allowed:\t"); | 2542 | seq_printf(m, "Mems_allowed:\t"); |
2558 | seq_nodemask(m, &task->mems_allowed); | 2543 | seq_nodemask(m, &task->mems_allowed); |
2559 | seq_printf(m, "\n"); | 2544 | seq_printf(m, "\n"); |
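
The helper cpuset_change_task_nodemask() added above avoids handing a task an empty mems_allowed when the old and new node sets are disjoint: it first ORs the new nodes into the task's mask, rebinds the mempolicy, and only then assigns the final mask. Below is a minimal userspace sketch of that ordering, with a plain 64-bit bitmask standing in for the kernel's nodemask_t; the names and the simplified model are illustrative only, not part of the patch.

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for the kernel's nodemask_t: one bit per memory node. */
typedef uint64_t nodemask;

/*
 * Mirror the two-step update used by cpuset_change_task_nodemask():
 * grow the mask first (old | new), then shrink it to exactly the new
 * set. At no point between the two stores is the mask empty, even if
 * the old and new node sets are disjoint.
 */
static void change_task_nodemask(nodemask *mems_allowed, nodemask newmems)
{
	*mems_allowed |= newmems;	/* step 1: also allow all new nodes */
	/* ... the mempolicy would be rebound against the widened mask here ... */
	*mems_allowed = newmems;	/* step 2: drop the now-disallowed old nodes */
}

int main(void)
{
	nodemask task_mems = 0x3;	/* old placement: nodes 0-1 */
	nodemask newmems   = 0xc;	/* new placement: nodes 2-3, disjoint from old */

	change_task_nodemask(&task_mems, newmems);
	printf("mems_allowed = 0x%llx\n", (unsigned long long)task_mems);
	return 0;
}

Between the two stores the mask only ever grows, so an allocator that samples it concurrently sees either the old set, the union, or the new set, never an empty set.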