aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--include/linux/mempolicy.h18
-rw-r--r--kernel/cpuset.c90
-rw-r--r--mm/mempolicy.c29
3 files changed, 137 insertions, 0 deletions
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 74357cb9bc7c..c7ac77e873b3 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -150,6 +150,16 @@ extern void numa_policy_init(void);
150extern void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *new); 150extern void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *new);
151extern void mpol_rebind_task(struct task_struct *tsk, 151extern void mpol_rebind_task(struct task_struct *tsk,
152 const nodemask_t *new); 152 const nodemask_t *new);
153extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
154#define set_cpuset_being_rebound(x) (cpuset_being_rebound = (x))
155
156#ifdef CONFIG_CPUSET
157#define current_cpuset_is_being_rebound() \
158 (cpuset_being_rebound == current->cpuset)
159#else
160#define current_cpuset_is_being_rebound() 0
161#endif
162
153extern struct mempolicy default_policy; 163extern struct mempolicy default_policy;
154extern struct zonelist *huge_zonelist(struct vm_area_struct *vma, 164extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
155 unsigned long addr); 165 unsigned long addr);
@@ -165,6 +175,8 @@ static inline void check_highest_zone(int k)
165int do_migrate_pages(struct mm_struct *mm, 175int do_migrate_pages(struct mm_struct *mm,
166 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags); 176 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags);
167 177
178extern void *cpuset_being_rebound; /* Trigger mpol_copy vma rebind */
179
168#else 180#else
169 181
170struct mempolicy {}; 182struct mempolicy {};
@@ -234,6 +246,12 @@ static inline void mpol_rebind_task(struct task_struct *tsk,
234{ 246{
235} 247}
236 248
249static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
250{
251}
252
253#define set_cpuset_being_rebound(x) do {} while (0)
254
237static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma, 255static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
238 unsigned long addr) 256 unsigned long addr)
239{ 257{
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 6004719f26ee..19f87565be17 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -812,12 +812,24 @@ static int update_cpumask(struct cpuset *cs, char *buf)
812} 812}
813 813
814/* 814/*
815 * Handle user request to change the 'mems' memory placement
816 * of a cpuset. Needs to validate the request, update the
817 * cpuset's mems_allowed and mems_generation, and for each 817 * cpuset's mems_allowed and mems_generation, and for each
818 * task in the cpuset, rebind any vma mempolicies.
819 *
815 * Call with manage_sem held. May take callback_sem during call. 820 * Call with manage_sem held. May take callback_sem during call.
821 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
822 * lock each such task's mm->mmap_sem, scan its vmas and rebind
823 * their mempolicies to the cpuset's new mems_allowed.
816 */ 824 */
817 825
818static int update_nodemask(struct cpuset *cs, char *buf) 826static int update_nodemask(struct cpuset *cs, char *buf)
819{ 827{
820 struct cpuset trialcs; 828 struct cpuset trialcs;
829 struct task_struct *g, *p;
830 struct mm_struct **mmarray;
831 int i, n, ntasks;
832 int fudge;
821 int retval; 833 int retval;
822 834
823 trialcs = *cs; 835 trialcs = *cs;
@@ -839,6 +851,76 @@ static int update_nodemask(struct cpuset *cs, char *buf)
839 cs->mems_generation = atomic_read(&cpuset_mems_generation); 851 cs->mems_generation = atomic_read(&cpuset_mems_generation);
840 up(&callback_sem); 852 up(&callback_sem);
841 853
854 set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */
855
856 fudge = 10; /* spare mmarray[] slots */
857 fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */
858 retval = -ENOMEM;
859
860 /*
861 * Allocate mmarray[] to hold mm reference for each task
862 * in cpuset cs. Can't kmalloc GFP_KERNEL while holding
863 * tasklist_lock. We could use GFP_ATOMIC, but with a
864 * few more lines of code, we can retry until we get a big
865 * enough mmarray[] w/o using GFP_ATOMIC.
866 */
867 while (1) {
868 ntasks = atomic_read(&cs->count); /* guess */
869 ntasks += fudge;
870 mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
871 if (!mmarray)
872 goto done;
873 write_lock_irq(&tasklist_lock); /* block fork */
874 if (atomic_read(&cs->count) <= ntasks)
875 break; /* got enough */
876 write_unlock_irq(&tasklist_lock); /* try again */
877 kfree(mmarray);
878 }
879
880 n = 0;
881
882 /* Load up mmarray[] with mm reference for each task in cpuset. */
883 do_each_thread(g, p) {
884 struct mm_struct *mm;
885
886 if (n >= ntasks) {
887 printk(KERN_WARNING
888 "Cpuset mempolicy rebind incomplete.\n");
889 continue;
890 }
891 if (p->cpuset != cs)
892 continue;
893 mm = get_task_mm(p);
894 if (!mm)
895 continue;
896 mmarray[n++] = mm;
897 } while_each_thread(g, p);
898 write_unlock_irq(&tasklist_lock);
899
900 /*
901 * Now that we've dropped the tasklist spinlock, we can
902 * rebind the vma mempolicies of each mm in mmarray[] to their
903 * new cpuset, and release that mm. The mpol_rebind_mm()
904 * call takes mmap_sem, which we couldn't take while holding
905 * tasklist_lock. Forks can happen again now - the mpol_copy()
906 * cpuset_being_rebound check will catch such forks, and rebind
907 * their vma mempolicies too. Because we still hold the global
908 * cpuset manage_sem, we know that no other rebind effort will
909 * be contending for the global variable cpuset_being_rebound.
910 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
911 * is idempotent.
912 */
913 for (i = 0; i < n; i++) {
914 struct mm_struct *mm = mmarray[i];
915
916 mpol_rebind_mm(mm, &cs->mems_allowed);
917 mmput(mm);
918 }
919
920 /* We're done rebinding vmas to this cpuset's new mems_allowed. */
921 kfree(mmarray);
922 set_cpuset_being_rebound(NULL);
923 retval = 0;
842done: 924done:
843 return retval; 925 return retval;
844} 926}
@@ -1011,6 +1093,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
1011 struct cpuset *oldcs; 1093 struct cpuset *oldcs;
1012 cpumask_t cpus; 1094 cpumask_t cpus;
1013 nodemask_t from, to; 1095 nodemask_t from, to;
1096 struct mm_struct *mm;
1014 1097
1015 if (sscanf(pidbuf, "%d", &pid) != 1) 1098 if (sscanf(pidbuf, "%d", &pid) != 1)
1016 return -EIO; 1099 return -EIO;
@@ -1060,6 +1143,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
1060 to = cs->mems_allowed; 1143 to = cs->mems_allowed;
1061 1144
1062 up(&callback_sem); 1145 up(&callback_sem);
1146
1147 mm = get_task_mm(tsk);
1148 if (mm) {
1149 mpol_rebind_mm(mm, &to);
1150 mmput(mm);
1151 }
1152
1063 if (is_memory_migrate(cs)) 1153 if (is_memory_migrate(cs))
1064 do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL); 1154 do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL);
1065 put_task_struct(tsk); 1155 put_task_struct(tsk);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index c39bd86f4ea0..1850d0aef4ac 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1131,6 +1131,15 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1131} 1131}
1132EXPORT_SYMBOL(alloc_pages_current); 1132EXPORT_SYMBOL(alloc_pages_current);
1133 1133
1134/*
1135 * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
1136 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
1137 * with the mems_allowed returned by cpuset_mems_allowed(). This
1138 * keeps mempolicies cpuset-relative after its cpuset moves. See
1139 * also update_nodemask() in kernel/cpuset.c.
1140 */
1141void *cpuset_being_rebound;
1142
1134/* Slow path of a mempolicy copy */ 1143/* Slow path of a mempolicy copy */
1135struct mempolicy *__mpol_copy(struct mempolicy *old) 1144struct mempolicy *__mpol_copy(struct mempolicy *old)
1136{ 1145{
@@ -1138,6 +1147,10 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
1138 1147
1139 if (!new) 1148 if (!new)
1140 return ERR_PTR(-ENOMEM); 1149 return ERR_PTR(-ENOMEM);
1150 if (current_cpuset_is_being_rebound()) {
1151 nodemask_t mems = cpuset_mems_allowed(current);
1152 mpol_rebind_policy(old, &mems);
1153 }
1141 *new = *old; 1154 *new = *old;
1142 atomic_set(&new->refcnt, 1); 1155 atomic_set(&new->refcnt, 1);
1143 if (new->policy == MPOL_BIND) { 1156 if (new->policy == MPOL_BIND) {
@@ -1481,6 +1494,22 @@ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1481} 1494}
1482 1495
1483/* 1496/*
1497 * Rebind each vma in mm to new nodemask.
1498 *
1499 * Call holding a reference to mm. Takes mm->mmap_sem during call.
1500 */
1501
1502void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1503{
1504 struct vm_area_struct *vma;
1505
1506 down_write(&mm->mmap_sem);
1507 for (vma = mm->mmap; vma; vma = vma->vm_next)
1508 mpol_rebind_policy(vma->vm_policy, new);
1509 up_write(&mm->mmap_sem);
1510}
1511
1512/*
1484 * Display pages allocated per node and memory policy via /proc. 1513 * Display pages allocated per node and memory policy via /proc.
1485 */ 1514 */
1486 1515