diff options
-rw-r--r-- | Documentation/cpusets.txt | 25 | ||||
-rw-r--r-- | include/linux/mempolicy.h | 7 | ||||
-rw-r--r-- | kernel/cpuset.c | 38 |
3 files changed, 68 insertions, 2 deletions
diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt index a09a8eb80665..e2d9afc30d2d 100644 --- a/Documentation/cpusets.txt +++ b/Documentation/cpusets.txt | |||
@@ -192,6 +192,7 @@ containing the following files describing that cpuset: | |||
192 | 192 | ||
193 | - cpus: list of CPUs in that cpuset | 193 | - cpus: list of CPUs in that cpuset |
194 | - mems: list of Memory Nodes in that cpuset | 194 | - mems: list of Memory Nodes in that cpuset |
195 | - memory_migrate flag: if set, move pages to cpusets nodes | ||
195 | - cpu_exclusive flag: is cpu placement exclusive? | 196 | - cpu_exclusive flag: is cpu placement exclusive? |
196 | - mem_exclusive flag: is memory placement exclusive? | 197 | - mem_exclusive flag: is memory placement exclusive? |
197 | - tasks: list of tasks (by pid) attached to that cpuset | 198 | - tasks: list of tasks (by pid) attached to that cpuset |
@@ -277,6 +278,30 @@ rewritten to the 'tasks' file of its cpuset. This is done to avoid | |||
277 | impacting the scheduler code in the kernel with a check for changes | 278 | impacting the scheduler code in the kernel with a check for changes |
278 | in a tasks processor placement. | 279 | in a tasks processor placement. |
279 | 280 | ||
281 | Normally, once a page is allocated (given a physical page | ||
282 | of main memory) then that page stays on whatever node it | ||
283 | was allocated, so long as it remains allocated, even if the | ||
284 | cpusets memory placement policy 'mems' subsequently changes. | ||
285 | If the cpuset flag file 'memory_migrate' is set true, then when | ||
286 | tasks are attached to that cpuset, any pages that task had | ||
287 | allocated to it on nodes in its previous cpuset are migrated | ||
288 | to the tasks new cpuset. Depending on the implementation, | ||
289 | this migration may either be done by swapping the page out, | ||
290 | so that the next time the page is referenced, it will be paged | ||
291 | into the tasks new cpuset, usually on the node where it was | ||
292 | referenced, or this migration may be done by directly copying | ||
293 | the pages from the tasks previous cpuset to the new cpuset, | ||
294 | where possible to the same node, relative to the new cpuset, | ||
295 | as the node that held the page, relative to the old cpuset. | ||
296 | Also if 'memory_migrate' is set true, then if that cpusets | ||
297 | 'mems' file is modified, pages allocated to tasks in that | ||
298 | cpuset, that were on nodes in the previous setting of 'mems', | ||
299 | will be moved to nodes in the new setting of 'mems.' Again, | ||
300 | depending on the implementation, this might be done by swapping, | ||
301 | or by direct copying. In either case, pages that were not in | ||
302 | the tasks prior cpuset, or in the cpusets prior 'mems' setting, | ||
303 | will not be moved. | ||
304 | |||
280 | There is an exception to the above. If hotplug functionality is used | 305 | There is an exception to the above. If hotplug functionality is used |
281 | to remove all the CPUs that are currently assigned to a cpuset, | 306 | to remove all the CPUs that are currently assigned to a cpuset, |
282 | then the kernel will automatically update the cpus_allowed of all | 307 | then the kernel will automatically update the cpus_allowed of all |
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 3e61e829681d..66247eff24a0 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h | |||
@@ -235,6 +235,13 @@ static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma, | |||
235 | return NODE_DATA(0)->node_zonelists + gfp_zone(GFP_HIGHUSER); | 235 | return NODE_DATA(0)->node_zonelists + gfp_zone(GFP_HIGHUSER); |
236 | } | 236 | } |
237 | 237 | ||
238 | static inline int do_migrate_pages(struct mm_struct *mm, | ||
239 | const nodemask_t *from_nodes, | ||
240 | const nodemask_t *to_nodes, int flags) | ||
241 | { | ||
242 | return 0; | ||
243 | } | ||
244 | |||
238 | static inline void check_highest_zone(int k) | 245 | static inline void check_highest_zone(int k) |
239 | { | 246 | { |
240 | } | 247 | } |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 7430640f9816..f63383e01ec7 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -87,6 +87,7 @@ struct cpuset { | |||
87 | typedef enum { | 87 | typedef enum { |
88 | CS_CPU_EXCLUSIVE, | 88 | CS_CPU_EXCLUSIVE, |
89 | CS_MEM_EXCLUSIVE, | 89 | CS_MEM_EXCLUSIVE, |
90 | CS_MEMORY_MIGRATE, | ||
90 | CS_REMOVED, | 91 | CS_REMOVED, |
91 | CS_NOTIFY_ON_RELEASE | 92 | CS_NOTIFY_ON_RELEASE |
92 | } cpuset_flagbits_t; | 93 | } cpuset_flagbits_t; |
@@ -112,6 +113,11 @@ static inline int notify_on_release(const struct cpuset *cs) | |||
112 | return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); | 113 | return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); |
113 | } | 114 | } |
114 | 115 | ||
116 | static inline int is_memory_migrate(const struct cpuset *cs) | ||
117 | { | ||
118 | return !!test_bit(CS_MEMORY_MIGRATE, &cs->flags); | ||
119 | } | ||
120 | |||
115 | /* | 121 | /* |
116 | * Increment this atomic integer everytime any cpuset changes its | 122 | * Increment this atomic integer everytime any cpuset changes its |
117 | * mems_allowed value. Users of cpusets can track this generation | 123 | * mems_allowed value. Users of cpusets can track this generation |
@@ -602,16 +608,24 @@ static void refresh_mems(void) | |||
602 | if (current->cpuset_mems_generation != my_cpusets_mem_gen) { | 608 | if (current->cpuset_mems_generation != my_cpusets_mem_gen) { |
603 | struct cpuset *cs; | 609 | struct cpuset *cs; |
604 | nodemask_t oldmem = current->mems_allowed; | 610 | nodemask_t oldmem = current->mems_allowed; |
611 | int migrate; | ||
605 | 612 | ||
606 | down(&callback_sem); | 613 | down(&callback_sem); |
607 | task_lock(current); | 614 | task_lock(current); |
608 | cs = current->cpuset; | 615 | cs = current->cpuset; |
616 | migrate = is_memory_migrate(cs); | ||
609 | guarantee_online_mems(cs, &current->mems_allowed); | 617 | guarantee_online_mems(cs, &current->mems_allowed); |
610 | current->cpuset_mems_generation = cs->mems_generation; | 618 | current->cpuset_mems_generation = cs->mems_generation; |
611 | task_unlock(current); | 619 | task_unlock(current); |
612 | up(&callback_sem); | 620 | up(&callback_sem); |
613 | if (!nodes_equal(oldmem, current->mems_allowed)) | 621 | if (!nodes_equal(oldmem, current->mems_allowed)) { |
614 | numa_policy_rebind(&oldmem, &current->mems_allowed); | 622 | numa_policy_rebind(&oldmem, &current->mems_allowed); |
623 | if (migrate) { | ||
624 | do_migrate_pages(current->mm, &oldmem, | ||
625 | &current->mems_allowed, | ||
626 | MPOL_MF_MOVE_ALL); | ||
627 | } | ||
628 | } | ||
615 | } | 629 | } |
616 | } | 630 | } |
617 | 631 | ||
@@ -795,7 +809,7 @@ static int update_nodemask(struct cpuset *cs, char *buf) | |||
795 | /* | 809 | /* |
796 | * update_flag - read a 0 or a 1 in a file and update associated flag | 810 | * update_flag - read a 0 or a 1 in a file and update associated flag |
797 | * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, | 811 | * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, |
798 | * CS_NOTIFY_ON_RELEASE) | 812 | * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE) |
799 | * cs: the cpuset to update | 813 | * cs: the cpuset to update |
800 | * buf: the buffer where we read the 0 or 1 | 814 | * buf: the buffer where we read the 0 or 1 |
801 | * | 815 | * |
@@ -848,6 +862,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
848 | struct task_struct *tsk; | 862 | struct task_struct *tsk; |
849 | struct cpuset *oldcs; | 863 | struct cpuset *oldcs; |
850 | cpumask_t cpus; | 864 | cpumask_t cpus; |
865 | nodemask_t from, to; | ||
851 | 866 | ||
852 | if (sscanf(pidbuf, "%d", &pid) != 1) | 867 | if (sscanf(pidbuf, "%d", &pid) != 1) |
853 | return -EIO; | 868 | return -EIO; |
@@ -893,7 +908,12 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
893 | guarantee_online_cpus(cs, &cpus); | 908 | guarantee_online_cpus(cs, &cpus); |
894 | set_cpus_allowed(tsk, cpus); | 909 | set_cpus_allowed(tsk, cpus); |
895 | 910 | ||
911 | from = oldcs->mems_allowed; | ||
912 | to = cs->mems_allowed; | ||
913 | |||
896 | up(&callback_sem); | 914 | up(&callback_sem); |
915 | if (is_memory_migrate(cs)) | ||
916 | do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL); | ||
897 | put_task_struct(tsk); | 917 | put_task_struct(tsk); |
898 | if (atomic_dec_and_test(&oldcs->count)) | 918 | if (atomic_dec_and_test(&oldcs->count)) |
899 | check_for_release(oldcs, ppathbuf); | 919 | check_for_release(oldcs, ppathbuf); |
@@ -905,6 +925,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
905 | typedef enum { | 925 | typedef enum { |
906 | FILE_ROOT, | 926 | FILE_ROOT, |
907 | FILE_DIR, | 927 | FILE_DIR, |
928 | FILE_MEMORY_MIGRATE, | ||
908 | FILE_CPULIST, | 929 | FILE_CPULIST, |
909 | FILE_MEMLIST, | 930 | FILE_MEMLIST, |
910 | FILE_CPU_EXCLUSIVE, | 931 | FILE_CPU_EXCLUSIVE, |
@@ -960,6 +981,9 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us | |||
960 | case FILE_NOTIFY_ON_RELEASE: | 981 | case FILE_NOTIFY_ON_RELEASE: |
961 | retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer); | 982 | retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer); |
962 | break; | 983 | break; |
984 | case FILE_MEMORY_MIGRATE: | ||
985 | retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); | ||
986 | break; | ||
963 | case FILE_TASKLIST: | 987 | case FILE_TASKLIST: |
964 | retval = attach_task(cs, buffer, &pathbuf); | 988 | retval = attach_task(cs, buffer, &pathbuf); |
965 | break; | 989 | break; |
@@ -1060,6 +1084,9 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, | |||
1060 | case FILE_NOTIFY_ON_RELEASE: | 1084 | case FILE_NOTIFY_ON_RELEASE: |
1061 | *s++ = notify_on_release(cs) ? '1' : '0'; | 1085 | *s++ = notify_on_release(cs) ? '1' : '0'; |
1062 | break; | 1086 | break; |
1087 | case FILE_MEMORY_MIGRATE: | ||
1088 | *s++ = is_memory_migrate(cs) ? '1' : '0'; | ||
1089 | break; | ||
1063 | default: | 1090 | default: |
1064 | retval = -EINVAL; | 1091 | retval = -EINVAL; |
1065 | goto out; | 1092 | goto out; |
@@ -1408,6 +1435,11 @@ static struct cftype cft_notify_on_release = { | |||
1408 | .private = FILE_NOTIFY_ON_RELEASE, | 1435 | .private = FILE_NOTIFY_ON_RELEASE, |
1409 | }; | 1436 | }; |
1410 | 1437 | ||
1438 | static struct cftype cft_memory_migrate = { | ||
1439 | .name = "memory_migrate", | ||
1440 | .private = FILE_MEMORY_MIGRATE, | ||
1441 | }; | ||
1442 | |||
1411 | static int cpuset_populate_dir(struct dentry *cs_dentry) | 1443 | static int cpuset_populate_dir(struct dentry *cs_dentry) |
1412 | { | 1444 | { |
1413 | int err; | 1445 | int err; |
@@ -1422,6 +1454,8 @@ static int cpuset_populate_dir(struct dentry *cs_dentry) | |||
1422 | return err; | 1454 | return err; |
1423 | if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0) | 1455 | if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0) |
1424 | return err; | 1456 | return err; |
1457 | if ((err = cpuset_add_file(cs_dentry, &cft_memory_migrate)) < 0) | ||
1458 | return err; | ||
1425 | if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0) | 1459 | if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0) |
1426 | return err; | 1460 | return err; |
1427 | return 0; | 1461 | return 0; |