-rw-r--r--   include/linux/mempolicy.h  | 18
-rw-r--r--   kernel/cpuset.c            | 90
-rw-r--r--   mm/mempolicy.c             | 29
3 files changed, 137 insertions(+), 0 deletions(-)
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 74357cb9bc7c..c7ac77e873b3 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -150,6 +150,16 @@ extern void numa_policy_init(void);
 extern void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *new);
 extern void mpol_rebind_task(struct task_struct *tsk,
                 const nodemask_t *new);
+extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
+#define set_cpuset_being_rebound(x) (cpuset_being_rebound = (x))
+
+#ifdef CONFIG_CPUSET
+#define current_cpuset_is_being_rebound() \
+        (cpuset_being_rebound == current->cpuset)
+#else
+#define current_cpuset_is_being_rebound() 0
+#endif
+
 extern struct mempolicy default_policy;
 extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
         unsigned long addr);
@@ -165,6 +175,8 @@ static inline void check_highest_zone(int k)
 int do_migrate_pages(struct mm_struct *mm,
     const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags);
 
+extern void *cpuset_being_rebound;  /* Trigger mpol_copy vma rebind */
+
 #else
 
 struct mempolicy {};
@@ -234,6 +246,12 @@ static inline void mpol_rebind_task(struct task_struct *tsk,
 {
 }
 
+static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
+{
+}
+
+#define set_cpuset_being_rebound(x) do {} while (0)
+
 static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
         unsigned long addr)
 {
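
The header changes above amount to a compile-time switch: when the relevant config options are off, current_cpuset_is_being_rebound() collapses to the constant 0 and set_cpuset_being_rebound() to a no-op, so mm/mempolicy.c (see the hunks further down) can call them without any #ifdef of its own. A minimal user-space sketch of that pattern follows; it is not part of the patch, it collapses the CONFIG_NUMA/CONFIG_CPUSET split into a single flag, and the names CPUSET_ENABLED, current_cpuset and fake_cpuset are illustrative stand-ins.

#include <stdio.h>

#define CPUSET_ENABLED 1                /* flip to 0 to exercise the stub versions */

static void *cpuset_being_rebound;      /* as declared in the header hunk above */
static void *current_cpuset;            /* stands in for current->cpuset */

#if CPUSET_ENABLED
#define set_cpuset_being_rebound(x)     (cpuset_being_rebound = (x))
#define current_cpuset_is_being_rebound() \
        (cpuset_being_rebound == current_cpuset)
#else
#define set_cpuset_being_rebound(x)     do {} while (0)
#define current_cpuset_is_being_rebound() 0
#endif

int main(void)
{
        int fake_cpuset;        /* any address serves as a cpuset "identity" here */

        current_cpuset = &fake_cpuset;
        printf("before rebind: %d\n", current_cpuset_is_being_rebound());
        set_cpuset_being_rebound(&fake_cpuset); /* what update_nodemask() does first */
        printf("during rebind: %d\n", current_cpuset_is_being_rebound());
        set_cpuset_being_rebound(NULL);         /* and what it does when finished */
        printf("after rebind:  %d\n", current_cpuset_is_being_rebound());
        return 0;
}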
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 6004719f26ee..19f87565be17 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -812,12 +812,24 @@ static int update_cpumask(struct cpuset *cs, char *buf)
 }
 
 /*
+ * Handle user request to change the 'mems' memory placement
+ * of a cpuset.  Needs to validate the request, update the
+ * cpusets mems_allowed and mems_generation, and for each
+ * task in the cpuset, rebind any vma mempolicies.
+ *
  * Call with manage_sem held.  May take callback_sem during call.
+ * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
+ * lock each such tasks mm->mmap_sem, scan its vma's and rebind
+ * their mempolicies to the cpusets new mems_allowed.
  */
 
 static int update_nodemask(struct cpuset *cs, char *buf)
 {
         struct cpuset trialcs;
+        struct task_struct *g, *p;
+        struct mm_struct **mmarray;
+        int i, n, ntasks;
+        int fudge;
         int retval;
 
         trialcs = *cs;
@@ -839,6 +851,76 @@ static int update_nodemask(struct cpuset *cs, char *buf)
         cs->mems_generation = atomic_read(&cpuset_mems_generation);
         up(&callback_sem);
 
+        set_cpuset_being_rebound(cs);   /* causes mpol_copy() rebind */
+
+        fudge = 10;                     /* spare mmarray[] slots */
+        fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */
+        retval = -ENOMEM;
+
+        /*
+         * Allocate mmarray[] to hold mm reference for each task
+         * in cpuset cs.  Can't kmalloc GFP_KERNEL while holding
+         * tasklist_lock.  We could use GFP_ATOMIC, but with a
+         * few more lines of code, we can retry until we get a big
+         * enough mmarray[] w/o using GFP_ATOMIC.
+         */
+        while (1) {
+                ntasks = atomic_read(&cs->count);       /* guess */
+                ntasks += fudge;
+                mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
+                if (!mmarray)
+                        goto done;
+                write_lock_irq(&tasklist_lock);         /* block fork */
+                if (atomic_read(&cs->count) <= ntasks)
+                        break;                          /* got enough */
+                write_unlock_irq(&tasklist_lock);       /* try again */
+                kfree(mmarray);
+        }
+
+        n = 0;
+
+        /* Load up mmarray[] with mm reference for each task in cpuset. */
+        do_each_thread(g, p) {
+                struct mm_struct *mm;
+
+                if (n >= ntasks) {
+                        printk(KERN_WARNING
+                                "Cpuset mempolicy rebind incomplete.\n");
+                        continue;
+                }
+                if (p->cpuset != cs)
+                        continue;
+                mm = get_task_mm(p);
+                if (!mm)
+                        continue;
+                mmarray[n++] = mm;
+        } while_each_thread(g, p);
+        write_unlock_irq(&tasklist_lock);
+
+        /*
+         * Now that we've dropped the tasklist spinlock, we can
+         * rebind the vma mempolicies of each mm in mmarray[] to their
+         * new cpuset, and release that mm.  The mpol_rebind_mm()
+         * call takes mmap_sem, which we couldn't take while holding
+         * tasklist_lock.  Forks can happen again now - the mpol_copy()
+         * cpuset_being_rebound check will catch such forks, and rebind
+         * their vma mempolicies too.  Because we still hold the global
+         * cpuset manage_sem, we know that no other rebind effort will
+         * be contending for the global variable cpuset_being_rebound.
+         * It's ok if we rebind the same mm twice; mpol_rebind_mm()
+         * is idempotent.
+         */
+        for (i = 0; i < n; i++) {
+                struct mm_struct *mm = mmarray[i];
+
+                mpol_rebind_mm(mm, &cs->mems_allowed);
+                mmput(mm);
+        }
+
+        /* We're done rebinding vma's to this cpusets new mems_allowed. */
+        kfree(mmarray);
+        set_cpuset_being_rebound(NULL);
+        retval = 0;
 done:
         return retval;
 }
@@ -1011,6 +1093,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
         struct cpuset *oldcs;
         cpumask_t cpus;
         nodemask_t from, to;
+        struct mm_struct *mm;
 
         if (sscanf(pidbuf, "%d", &pid) != 1)
                 return -EIO;
@@ -1060,6 +1143,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
         to = cs->mems_allowed;
 
         up(&callback_sem);
+
+        mm = get_task_mm(tsk);
+        if (mm) {
+                mpol_rebind_mm(mm, &to);
+                mmput(mm);
+        }
+
         if (is_memory_migrate(cs))
                 do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL);
         put_task_struct(tsk);
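
One detail of update_nodemask() above is worth calling out: mmarray[] has to be sized while tasklist_lock is held (so no task can fork past the end of the array), but a GFP_KERNEL allocation may sleep and therefore cannot happen under that spinlock. The patch resolves this by guessing a size, allocating outside the lock, re-checking the task count under the lock, and retrying with a bigger guess if the cpuset grew in the meantime. Below is a self-contained user-space sketch of the same retry pattern, not part of the patch; list_lock, member_count and alloc_slots_locked() are illustrative stand-ins for tasklist_lock, cs->count and the loop in update_nodemask().

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static int member_count = 42;           /* may change while the lock is dropped */

/* On success, returns the array with list_lock held; the caller unlocks. */
static void **alloc_slots_locked(int fudge, int *slots_out)
{
        void **array;
        int slots;

        for (;;) {
                slots = member_count + fudge;           /* guess */
                array = malloc(slots * sizeof(*array)); /* may "sleep": lock not held */
                if (!array)
                        return NULL;
                pthread_mutex_lock(&list_lock);         /* "block fork" */
                if (member_count <= slots)
                        break;                          /* got enough */
                pthread_mutex_unlock(&list_lock);       /* population grew: retry */
                free(array);
        }
        *slots_out = slots;
        return array;
}

int main(void)
{
        int slots;
        void **array = alloc_slots_locked(10, &slots);

        if (array) {
                printf("allocated %d slots under the lock\n", slots);
                pthread_mutex_unlock(&list_lock);
                free(array);
        }
        return 0;
}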
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index c39bd86f4ea0..1850d0aef4ac 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1131,6 +1131,15 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 }
 EXPORT_SYMBOL(alloc_pages_current);
 
+/*
+ * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
+ * rebinds the mempolicy its copying by calling mpol_rebind_policy()
+ * with the mems_allowed returned by cpuset_mems_allowed().  This
+ * keeps mempolicies cpuset relative after its cpuset moves.  See
+ * further kernel/cpuset.c update_nodemask().
+ */
+void *cpuset_being_rebound;
+
 /* Slow path of a mempolicy copy */
 struct mempolicy *__mpol_copy(struct mempolicy *old)
 {
@@ -1138,6 +1147,10 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
 
         if (!new)
                 return ERR_PTR(-ENOMEM);
+        if (current_cpuset_is_being_rebound()) {
+                nodemask_t mems = cpuset_mems_allowed(current);
+                mpol_rebind_policy(old, &mems);
+        }
         *new = *old;
         atomic_set(&new->refcnt, 1);
         if (new->policy == MPOL_BIND) {
@@ -1481,6 +1494,22 @@ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
 }
 
 /*
+ * Rebind each vma in mm to new nodemask.
+ *
+ * Call holding a reference to mm.  Takes mm->mmap_sem during call.
+ */
+
+void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
+{
+        struct vm_area_struct *vma;
+
+        down_write(&mm->mmap_sem);
+        for (vma = mm->mmap; vma; vma = vma->vm_next)
+                mpol_rebind_policy(vma->vm_policy, new);
+        up_write(&mm->mmap_sem);
+}
+
+/*
  * Display pages allocated per node and memory policy via /proc.
  */
 
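
Finally, mpol_rebind_mm() itself is a simple walk: take the address-space lock for writing, visit every vma on the mm's list, and rebind its policy to the new node mask, which is why calling it twice on the same mm is harmless. A user-space sketch of that walk follows; it is not part of the patch, and fake_vma, fake_mm, rebind_policy() and rebind_mm() are illustrative stand-ins for vm_area_struct, mm_struct, mpol_rebind_policy() and mpol_rebind_mm().

#include <pthread.h>
#include <stdio.h>

struct fake_vma {
        struct fake_vma *next;
        int policy_node;                /* stands in for vma->vm_policy */
};

struct fake_mm {
        pthread_rwlock_t map_sem;       /* stands in for mm->mmap_sem */
        struct fake_vma *mmap;          /* head of the vma list */
};

/* Stand-in for mpol_rebind_policy(): idempotent, so a second pass is safe. */
static void rebind_policy(struct fake_vma *vma, int new_node)
{
        vma->policy_node = new_node;
}

static void rebind_mm(struct fake_mm *mm, int new_node)
{
        struct fake_vma *vma;

        pthread_rwlock_wrlock(&mm->map_sem);
        for (vma = mm->mmap; vma; vma = vma->next)
                rebind_policy(vma, new_node);
        pthread_rwlock_unlock(&mm->map_sem);
}

int main(void)
{
        struct fake_vma b = { NULL, 0 };
        struct fake_vma a = { &b, 0 };
        struct fake_mm mm;

        pthread_rwlock_init(&mm.map_sem, NULL);
        mm.mmap = &a;

        rebind_mm(&mm, 3);
        rebind_mm(&mm, 3);      /* rebinding twice is harmless, as the patch notes */
        printf("vma policies now on nodes %d and %d\n", a.policy_node, b.policy_node);
        pthread_rwlock_destroy(&mm.map_sem);
        return 0;
}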