aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPaul Jackson <pj@sgi.com>2006-01-08 04:01:59 -0500
committerLinus Torvalds <torvalds@g5.osdl.org>2006-01-08 23:13:44 -0500
commit4225399a66b315d4d1fb1cb61b75dda201c832e3 (patch)
treec8bd976bc6590c5fe859c6129abb93072d99cfa8
parent202f72d5d1b5c2c084f63ef996c736d208b447b5 (diff)
[PATCH] cpuset: rebind vma mempolicies fix
Fix more of a longstanding bug in the cpuset/mempolicy interaction. NUMA mempolicies (mm/mempolicy.c) are constrained by the current task's cpuset to just the Memory Nodes allowed by that cpuset. The kernel maintains internal state for each mempolicy, tracking what nodes are used for the MPOL_INTERLEAVE, MPOL_BIND or MPOL_PREFERRED policies. When a task's cpuset memory placement changes, whether because the cpuset changed, or because the task was attached to a different cpuset, then the task's mempolicies have to be rebound to the new cpuset placement, so as to preserve the cpuset-relative numbering of the nodes in that policy. An earlier fix handled such mempolicy rebinding for mempolicies attached to a task. This fix rebinds mempolicies attached to vma's (address ranges in a task's address space). Due to the need to hold the task->mm->mmap_sem semaphore while updating vma's, the rebinding of vma mempolicies has to be done when the cpuset memory placement is changed, at which time mmap_sem can be safely acquired. The task's mempolicy is rebound later, when the task next attempts to allocate memory and notices that its task->cpuset_mems_generation is out-of-date with its cpuset's mems_generation. Because walking the tasklist to find all tasks attached to a changing cpuset requires holding tasklist_lock, a spinlock, one cannot update the vma's of the affected tasks while doing the tasklist scan. In general, one cannot acquire a semaphore (which can sleep) while already holding a spinlock (such as tasklist_lock). So a list of mm references has to be built up during the tasklist scan, then the tasklist lock dropped, then for each mm, its mmap_sem acquired, and the vma's in that mm rebound. Once the tasklist lock is dropped, affected tasks may fork new tasks, before their mm's are rebound. 
A kernel global 'cpuset_being_rebound' is set to point to the cpuset being rebound (there can only be one; cpuset modifications are done under a global 'manage_sem' semaphore), and the mpol_copy code that is used to copy a task's mempolicies during fork catches such forking tasks, and ensures their children are also rebound. When a task is moved to a different cpuset, it is easier, as there is only one task involved. Its mm->vma's are scanned, using the same mpol_rebind_policy() as used above. It may happen that both the mpol_copy hook and the update done via the tasklist scan update the same mm twice. This is ok, as the mempolicies of each vma in an mm keep track of what mems_allowed they are relative to, and safely no-op a second request to rebind to the same nodes. Signed-off-by: Paul Jackson <pj@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--include/linux/mempolicy.h18
-rw-r--r--kernel/cpuset.c90
-rw-r--r--mm/mempolicy.c29
3 files changed, 137 insertions, 0 deletions
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 74357cb9bc7c..c7ac77e873b3 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -150,6 +150,16 @@ extern void numa_policy_init(void);
150extern void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *new); 150extern void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *new);
151extern void mpol_rebind_task(struct task_struct *tsk, 151extern void mpol_rebind_task(struct task_struct *tsk,
152 const nodemask_t *new); 152 const nodemask_t *new);
153extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
154#define set_cpuset_being_rebound(x) (cpuset_being_rebound = (x))
155
156#ifdef CONFIG_CPUSET
157#define current_cpuset_is_being_rebound() \
158 (cpuset_being_rebound == current->cpuset)
159#else
160#define current_cpuset_is_being_rebound() 0
161#endif
162
153extern struct mempolicy default_policy; 163extern struct mempolicy default_policy;
154extern struct zonelist *huge_zonelist(struct vm_area_struct *vma, 164extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
155 unsigned long addr); 165 unsigned long addr);
@@ -165,6 +175,8 @@ static inline void check_highest_zone(int k)
165int do_migrate_pages(struct mm_struct *mm, 175int do_migrate_pages(struct mm_struct *mm,
166 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags); 176 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags);
167 177
178extern void *cpuset_being_rebound; /* Trigger mpol_copy vma rebind */
179
168#else 180#else
169 181
170struct mempolicy {}; 182struct mempolicy {};
@@ -234,6 +246,12 @@ static inline void mpol_rebind_task(struct task_struct *tsk,
234{ 246{
235} 247}
236 248
249static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
250{ /* deliberately empty: this #else-branch variant performs no rebinding; both arguments are ignored */
251}
252
253#define set_cpuset_being_rebound(x) do {} while (0)
254
237static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma, 255static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
238 unsigned long addr) 256 unsigned long addr)
239{ 257{
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 6004719f26ee..19f87565be17 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -812,12 +812,24 @@ static int update_cpumask(struct cpuset *cs, char *buf)
812} 812}
813 813
814/* 814/*
815 * Handle user request to change the 'mems' memory placement
816 * of a cpuset. Needs to validate the request, update the
817 * cpusets mems_allowed and mems_generation, and for each
818 * task in the cpuset, rebind any vma mempolicies.
819 *
815 * Call with manage_sem held. May take callback_sem during call. 820 * Call with manage_sem held. May take callback_sem during call.
821 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
822 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
823 * their mempolicies to the cpusets new mems_allowed.
816 */ 824 */
817 825
818static int update_nodemask(struct cpuset *cs, char *buf) 826static int update_nodemask(struct cpuset *cs, char *buf)
819{ 827{
820 struct cpuset trialcs; 828 struct cpuset trialcs;
829 struct task_struct *g, *p;
830 struct mm_struct **mmarray;
831 int i, n, ntasks;
832 int fudge;
821 int retval; 833 int retval;
822 834
823 trialcs = *cs; 835 trialcs = *cs;
@@ -839,6 +851,76 @@ static int update_nodemask(struct cpuset *cs, char *buf)
839 cs->mems_generation = atomic_read(&cpuset_mems_generation); 851 cs->mems_generation = atomic_read(&cpuset_mems_generation);
840 up(&callback_sem); 852 up(&callback_sem);
841 853
854 set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */
855
856 fudge = 10; /* spare mmarray[] slots */
857 fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */
858 retval = -ENOMEM;
859
860 /*
861 * Allocate mmarray[] to hold mm reference for each task
862 * in cpuset cs. Can't kmalloc GFP_KERNEL while holding
863 * tasklist_lock. We could use GFP_ATOMIC, but with a
864 * few more lines of code, we can retry until we get a big
865 * enough mmarray[] w/o using GFP_ATOMIC.
866 */
867 while (1) {
868 ntasks = atomic_read(&cs->count); /* guess */
869 ntasks += fudge;
870 mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
871 if (!mmarray)
872 goto done;
873 write_lock_irq(&tasklist_lock); /* block fork */
874 if (atomic_read(&cs->count) <= ntasks)
875 break; /* got enough */
876 write_unlock_irq(&tasklist_lock); /* try again */
877 kfree(mmarray);
878 }
879
880 n = 0;
881
882 /* Load up mmarray[] with mm reference for each task in cpuset. */
883 do_each_thread(g, p) {
884 struct mm_struct *mm;
885
886 if (n >= ntasks) {
887 printk(KERN_WARNING
888 "Cpuset mempolicy rebind incomplete.\n");
889 continue;
890 }
891 if (p->cpuset != cs)
892 continue;
893 mm = get_task_mm(p);
894 if (!mm)
895 continue;
896 mmarray[n++] = mm;
897 } while_each_thread(g, p);
898 write_unlock_irq(&tasklist_lock);
899
900 /*
901 * Now that we've dropped the tasklist spinlock, we can
902 * rebind the vma mempolicies of each mm in mmarray[] to their
903 * new cpuset, and release that mm. The mpol_rebind_mm()
904 * call takes mmap_sem, which we couldn't take while holding
905 * tasklist_lock. Forks can happen again now - the mpol_copy()
906 * cpuset_being_rebound check will catch such forks, and rebind
907 * their vma mempolicies too. Because we still hold the global
908 * cpuset manage_sem, we know that no other rebind effort will
909 * be contending for the global variable cpuset_being_rebound.
910 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
911 * is idempotent.
912 */
913 for (i = 0; i < n; i++) {
914 struct mm_struct *mm = mmarray[i];
915
916 mpol_rebind_mm(mm, &cs->mems_allowed);
917 mmput(mm);
918 }
919
920 /* We're done rebinding vma's to this cpusets new mems_allowed. */
921 kfree(mmarray);
922 set_cpuset_being_rebound(NULL);
923 retval = 0;
842done: 924done:
843 return retval; 925 return retval;
844} 926}
@@ -1011,6 +1093,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
1011 struct cpuset *oldcs; 1093 struct cpuset *oldcs;
1012 cpumask_t cpus; 1094 cpumask_t cpus;
1013 nodemask_t from, to; 1095 nodemask_t from, to;
1096 struct mm_struct *mm;
1014 1097
1015 if (sscanf(pidbuf, "%d", &pid) != 1) 1098 if (sscanf(pidbuf, "%d", &pid) != 1)
1016 return -EIO; 1099 return -EIO;
@@ -1060,6 +1143,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
1060 to = cs->mems_allowed; 1143 to = cs->mems_allowed;
1061 1144
1062 up(&callback_sem); 1145 up(&callback_sem);
1146
1147 mm = get_task_mm(tsk);
1148 if (mm) {
1149 mpol_rebind_mm(mm, &to);
1150 mmput(mm);
1151 }
1152
1063 if (is_memory_migrate(cs)) 1153 if (is_memory_migrate(cs))
1064 do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL); 1154 do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL);
1065 put_task_struct(tsk); 1155 put_task_struct(tsk);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index c39bd86f4ea0..1850d0aef4ac 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1131,6 +1131,15 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1131} 1131}
1132EXPORT_SYMBOL(alloc_pages_current); 1132EXPORT_SYMBOL(alloc_pages_current);
1133 1133
1134/*
1135 * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
1136 * rebinds the mempolicy its copying by calling mpol_rebind_policy()
1137 * with the mems_allowed returned by cpuset_mems_allowed(). This
1138 * keeps mempolicies cpuset relative after its cpuset moves. See
1139 * further kernel/cpuset.c update_nodemask().
1140 */
1141void *cpuset_being_rebound;
1142
1134/* Slow path of a mempolicy copy */ 1143/* Slow path of a mempolicy copy */
1135struct mempolicy *__mpol_copy(struct mempolicy *old) 1144struct mempolicy *__mpol_copy(struct mempolicy *old)
1136{ 1145{
@@ -1138,6 +1147,10 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
1138 1147
1139 if (!new) 1148 if (!new)
1140 return ERR_PTR(-ENOMEM); 1149 return ERR_PTR(-ENOMEM);
1150 if (current_cpuset_is_being_rebound()) {
1151 nodemask_t mems = cpuset_mems_allowed(current);
1152 mpol_rebind_policy(old, &mems);
1153 }
1141 *new = *old; 1154 *new = *old;
1142 atomic_set(&new->refcnt, 1); 1155 atomic_set(&new->refcnt, 1);
1143 if (new->policy == MPOL_BIND) { 1156 if (new->policy == MPOL_BIND) {
@@ -1481,6 +1494,22 @@ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1481} 1494}
1482 1495
1483/* 1496/*
1497 * Rebind each vma in mm to new nodemask.
1498 *
1499 * Call holding a reference to mm. Takes mm->mmap_sem during call.
1500 */
1501
1502void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1503{
1504 struct vm_area_struct *vma;
1505
1506 down_write(&mm->mmap_sem); /* write-lock mmap_sem for the duration of the vma walk */
1507 for (vma = mm->mmap; vma; vma = vma->vm_next) /* follow the vm_next chain starting at mm->mmap */
1508 mpol_rebind_policy(vma->vm_policy, new); /* vm_policy is passed through unchecked; NOTE(review): presumably mpol_rebind_policy() tolerates a NULL policy - confirm */
1509 up_write(&mm->mmap_sem); /* release the lock taken above */
1510}
1511
1512/*
1484 * Display pages allocated per node and memory policy via /proc. 1513 * Display pages allocated per node and memory policy via /proc.
1485 */ 1514 */
1486 1515