aboutsummaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
author: Paul Jackson <pj@sgi.com> 2006-01-08 04:01:59 -0500
committer: Linus Torvalds <torvalds@g5.osdl.org> 2006-01-08 23:13:44 -0500
commit: 4225399a66b315d4d1fb1cb61b75dda201c832e3 (patch)
tree: c8bd976bc6590c5fe859c6129abb93072d99cfa8 /include
parent: 202f72d5d1b5c2c084f63ef996c736d208b447b5 (diff)
[PATCH] cpuset: rebind vma mempolicies fix
Fix more of a longstanding bug in cpuset/mempolicy interaction. NUMA mempolicies (mm/mempolicy.c) are constrained by the current task's cpuset to just the Memory Nodes allowed by that cpuset. The kernel maintains internal state for each mempolicy, tracking what nodes are used for the MPOL_INTERLEAVE, MPOL_BIND or MPOL_PREFERRED policies. When a task's cpuset memory placement changes, whether because the cpuset changed, or because the task was attached to a different cpuset, then the task's mempolicies have to be rebound to the new cpuset placement, so as to preserve the cpuset-relative numbering of the nodes in that policy. An earlier fix handled such mempolicy rebinding for mempolicies attached to a task. This fix rebinds mempolicies attached to vma's (address ranges in a task's address space). Due to the need to hold the task->mm->mmap_sem semaphore while updating vma's, the rebinding of vma mempolicies has to be done when the cpuset memory placement is changed, at which time mmap_sem can be safely acquired. The task's mempolicy is rebound later, when the task next attempts to allocate memory and notices that its task->cpuset_mems_generation is out-of-date with its cpuset's mems_generation. Because walking the tasklist to find all tasks attached to a changing cpuset requires holding tasklist_lock, a spinlock, one cannot update the vma's of the affected tasks while doing the tasklist scan. In general, one cannot acquire a semaphore (which can sleep) while already holding a spinlock (such as tasklist_lock). So a list of mm references has to be built up during the tasklist scan, then the tasklist lock dropped, then for each mm, its mmap_sem acquired, and the vma's in that mm rebound. Once the tasklist lock is dropped, affected tasks may fork new tasks, before their mm's are rebound.
A kernel global 'cpuset_being_rebound' is set to point to the cpuset being rebound (there can only be one; cpuset modifications are done under a global 'manage_sem' semaphore), and the mpol_copy code that is used to copy a task's mempolicies during fork catches such forking tasks, and ensures their children are also rebound. When a task is moved to a different cpuset, it is easier, as there is only one task involved. Its mm->vma's are scanned, using the same mpol_rebind_policy() as used above. It may happen that both the mpol_copy hook and the update done via the tasklist scan update the same mm twice. This is ok, as the mempolicies of each vma in an mm keep track of what mems_allowed they are relative to, and safely no-op a second request to rebind to the same nodes. Signed-off-by: Paul Jackson <pj@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'include')
-rw-r--r-- include/linux/mempolicy.h 18
1 files changed, 18 insertions, 0 deletions
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 74357cb9bc7c..c7ac77e873b3 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -150,6 +150,16 @@ extern void numa_policy_init(void);
150extern void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *new); 150extern void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *new);
151extern void mpol_rebind_task(struct task_struct *tsk, 151extern void mpol_rebind_task(struct task_struct *tsk,
152 const nodemask_t *new); 152 const nodemask_t *new);
153extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
154#define set_cpuset_being_rebound(x) (cpuset_being_rebound = (x))
155
156#ifdef CONFIG_CPUSET
157#define current_cpuset_is_being_rebound() \
158 (cpuset_being_rebound == current->cpuset)
159#else
160#define current_cpuset_is_being_rebound() 0
161#endif
162
153extern struct mempolicy default_policy; 163extern struct mempolicy default_policy;
154extern struct zonelist *huge_zonelist(struct vm_area_struct *vma, 164extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
155 unsigned long addr); 165 unsigned long addr);
@@ -165,6 +175,8 @@ static inline void check_highest_zone(int k)
165int do_migrate_pages(struct mm_struct *mm, 175int do_migrate_pages(struct mm_struct *mm,
166 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags); 176 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags);
167 177
178extern void *cpuset_being_rebound; /* Trigger mpol_copy vma rebind */
179
168#else 180#else
169 181
170struct mempolicy {}; 182struct mempolicy {};
@@ -234,6 +246,12 @@ static inline void mpol_rebind_task(struct task_struct *tsk,
234{ 246{
235} 247}
236 248
249static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
250{
251}
252
253#define set_cpuset_being_rebound(x) do {} while (0)
254
237static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma, 255static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
238 unsigned long addr) 256 unsigned long addr)
239{ 257{